diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,78466 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0036764705882355, + "eval_steps": 500, + "global_step": 2451, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 211.5, + "completions/mean_terminated_length": 211.5, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.28229743242263794, + "epoch": 0.0012254901960784314, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7479166325555309, + "kl": 0.0, + "learning_rate": 0.0, + "loss": -0.0063, + "num_tokens": 29328.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5957260131835938, + "sampling/importance_sampling_ratio/mean": 1.0002766847610474, + "sampling/importance_sampling_ratio/min": 0.6407270431518555, + "sampling/sampling_logp_difference/max": 0.4673287868499756, + "sampling/sampling_logp_difference/mean": 0.013949751853942871, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 172.125, + "completions/mean_terminated_length": 172.125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3556089997291565, + "epoch": 0.0024509803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.065040650406504e-09, + "loss": 0.0, + "num_tokens": 56536.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6171398162841797, + "sampling/importance_sampling_ratio/mean": 1.0003067255020142, + "sampling/importance_sampling_ratio/min": 0.6687158942222595, + "sampling/sampling_logp_difference/max": 0.48065900802612305, + "sampling/sampling_logp_difference/mean": 0.016711918637156487, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 199.578125, + "completions/mean_terminated_length": 199.578125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4099624752998352, + "epoch": 0.003676470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002462603523173163, + "kl": 0.0005740458145737648, + "learning_rate": 8.130081300813008e-09, + "loss": 0.0, + "num_tokens": 87501.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.384435772895813, + "sampling/importance_sampling_ratio/mean": 1.000922679901123, + "sampling/importance_sampling_ratio/min": 0.6132686734199524, + "sampling/sampling_logp_difference/max": 0.4889521598815918, + "sampling/sampling_logp_difference/mean": 0.017119944095611572, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 233.375, + "completions/mean_terminated_length": 233.375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.38932323455810547, + "epoch": 0.004901960784313725, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8683691777653537, + "kl": 0.00048791104927659035, + "learning_rate": 1.2195121951219512e-08, + "loss": -0.0066, + "num_tokens": 121221.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8221690654754639, + "sampling/importance_sampling_ratio/mean": 0.999820351600647, + "sampling/importance_sampling_ratio/min": 0.6315130591392517, + "sampling/sampling_logp_difference/max": 0.6000275611877441, + "sampling/sampling_logp_difference/mean": 0.015830399468541145, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 242.5625, + "completions/mean_terminated_length": 242.5625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.3684294819831848, + "epoch": 0.006127450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0458740847410508, + "kl": 0.0005367250414565206, + "learning_rate": 1.6260162601626016e-08, + "loss": 0.0137, + "num_tokens": 161993.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6047180891036987, + "sampling/importance_sampling_ratio/mean": 1.0001524686813354, + "sampling/importance_sampling_ratio/min": 0.6081744432449341, + "sampling/sampling_logp_difference/max": 0.49729347229003906, + "sampling/sampling_logp_difference/mean": 0.015667764469981194, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 174.859375, + "completions/mean_terminated_length": 174.859375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.355221152305603, + "epoch": 0.007352941176470588, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9879967709804532, + "kl": 0.0006942846230231225, + "learning_rate": 2.032520325203252e-08, + "loss": -0.006, + "num_tokens": 188608.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6463251113891602, + "sampling/importance_sampling_ratio/mean": 1.000199317932129, + "sampling/importance_sampling_ratio/min": 0.5885282158851624, + "sampling/sampling_logp_difference/max": 0.5301303863525391, + "sampling/sampling_logp_difference/mean": 0.016422923654317856, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 210.875, + "completions/mean_terminated_length": 210.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3294309675693512, + "epoch": 0.00857843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.38987441779447, + "kl": 0.0005754978628829122, + "learning_rate": 2.4390243902439023e-08, + "loss": 0.0212, + "num_tokens": 220840.0, + "reward": 0.75, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.618558406829834, + "sampling/importance_sampling_ratio/mean": 0.9997539520263672, + "sampling/importance_sampling_ratio/min": 0.638060986995697, + "sampling/sampling_logp_difference/max": 0.4815359115600586, + "sampling/sampling_logp_difference/mean": 0.014915119856595993, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 173.65625, + "completions/mean_terminated_length": 173.65625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.33604592084884644, + "epoch": 0.00980392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002185939782348619, + "kl": 0.0005290215485729277, + "learning_rate": 2.8455284552845527e-08, + "loss": 0.0, + "num_tokens": 249602.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2978038787841797, + "sampling/importance_sampling_ratio/mean": 1.0001516342163086, + "sampling/importance_sampling_ratio/min": 0.6171172261238098, + "sampling/sampling_logp_difference/max": 0.4826962947845459, + "sampling/sampling_logp_difference/mean": 0.014389926567673683, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 185.3125, + "completions/mean_terminated_length": 185.3125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.376762330532074, + "epoch": 0.011029411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2913004060570497, + "kl": 0.0006159727345220745, + "learning_rate": 3.252032520325203e-08, + "loss": 0.0153, + "num_tokens": 283590.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3472496271133423, + "sampling/importance_sampling_ratio/mean": 0.9995624423027039, + "sampling/importance_sampling_ratio/min": 0.6078652739524841, + "sampling/sampling_logp_difference/max": 0.4978019595146179, + "sampling/sampling_logp_difference/mean": 0.016848571598529816, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 196.53125, + "completions/mean_terminated_length": 196.53125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.36589953303337097, + "epoch": 0.012254901960784314, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9788700484841335, + "kl": 0.0006352112395688891, + "learning_rate": 3.658536585365853e-08, + "loss": -0.0314, + "num_tokens": 316776.0, + "reward": -0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4398318529129028, + "sampling/importance_sampling_ratio/mean": 0.9998399019241333, + "sampling/importance_sampling_ratio/min": 0.6124805212020874, + "sampling/sampling_logp_difference/max": 0.4902381896972656, + "sampling/sampling_logp_difference/mean": 0.016363628208637238, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 188.140625, + "completions/mean_terminated_length": 188.140625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3949275016784668, + "epoch": 0.013480392156862746, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.742921810360062, + "kl": 0.0007006666273809969, + "learning_rate": 4.065040650406504e-08, + "loss": 0.0184, + "num_tokens": 345569.0, + "reward": 0.3125, + "reward_std": 0.6116957664489746, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6221922636032104, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.6204392910003662, + "sampling/sampling_logp_difference/max": 0.4837784767150879, + "sampling/sampling_logp_difference/mean": 0.017850767821073532, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 208.46875, + "completions/mean_terminated_length": 208.46875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.31998831033706665, + "epoch": 0.014705882352941176, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1855691193334834, + "kl": 0.0006962069310247898, + "learning_rate": 4.4715447154471546e-08, + "loss": -0.0278, + "num_tokens": 375807.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5744433403015137, + "sampling/importance_sampling_ratio/mean": 0.9997034072875977, + "sampling/importance_sampling_ratio/min": 0.6303018927574158, + "sampling/sampling_logp_difference/max": 0.46155643463134766, + "sampling/sampling_logp_difference/mean": 0.014594350941479206, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 172.109375, + "completions/mean_terminated_length": 172.109375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3764714300632477, + "epoch": 0.015931372549019607, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0914280328623476, + "kl": 0.0006829964695498347, + "learning_rate": 4.878048780487805e-08, + "loss": 0.0069, + "num_tokens": 401190.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.623640537261963, + "sampling/importance_sampling_ratio/mean": 1.0012812614440918, + "sampling/importance_sampling_ratio/min": 0.6074984669685364, + "sampling/sampling_logp_difference/max": 0.49840569496154785, + "sampling/sampling_logp_difference/mean": 0.017124183475971222, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 174.265625, + "completions/mean_terminated_length": 174.265625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.31244346499443054, + "epoch": 0.01715686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0529020435196639, + "kl": 0.0006743941339664161, + "learning_rate": 5.2845528455284554e-08, + "loss": -0.0149, + "num_tokens": 427351.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5630708932876587, + "sampling/importance_sampling_ratio/mean": 1.0000364780426025, + "sampling/importance_sampling_ratio/min": 0.6112802028656006, + "sampling/sampling_logp_difference/max": 0.4921998977661133, + "sampling/sampling_logp_difference/mean": 0.01569565385580063, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 186.65625, + "completions/mean_terminated_length": 186.65625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.2303016185760498, + "epoch": 0.01838235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002993495547442926, + "kl": 0.0005023834528401494, + "learning_rate": 5.6910569105691055e-08, + "loss": 0.0, + "num_tokens": 456849.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.596113681793213, + "sampling/importance_sampling_ratio/mean": 0.9996975660324097, + "sampling/importance_sampling_ratio/min": 0.48819631338119507, + "sampling/sampling_logp_difference/max": 0.7170376777648926, + "sampling/sampling_logp_difference/mean": 0.011747606098651886, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 219.90625, + "completions/mean_terminated_length": 219.90625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.42675280570983887, + "epoch": 0.0196078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9331486011235821, + "kl": 0.0005129431374371052, + "learning_rate": 6.097560975609756e-08, + "loss": 0.0012, + "num_tokens": 496571.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6089171171188354, + "sampling/importance_sampling_ratio/mean": 1.0008577108383179, + "sampling/importance_sampling_ratio/min": 0.6093137860298157, + "sampling/sampling_logp_difference/max": 0.4954218864440918, + "sampling/sampling_logp_difference/mean": 0.017686408013105392, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 181.96875, + "completions/mean_terminated_length": 181.96875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3967221975326538, + "epoch": 0.020833333333333332, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.401103340712179, + "kl": 0.0006027425406500697, + "learning_rate": 6.504065040650406e-08, + "loss": 0.0146, + "num_tokens": 524617.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6117963790893555, + "sampling/importance_sampling_ratio/mean": 1.0005756616592407, + "sampling/importance_sampling_ratio/min": 0.6582546830177307, + "sampling/sampling_logp_difference/max": 0.47734928131103516, + "sampling/sampling_logp_difference/mean": 0.016756337136030197, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 185.65625, + "completions/mean_terminated_length": 185.65625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2854838967323303, + "epoch": 0.022058823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0027790260821808736, + "kl": 0.000610773335210979, + "learning_rate": 6.910569105691057e-08, + "loss": 0.0, + "num_tokens": 551203.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5293587446212769, + "sampling/importance_sampling_ratio/mean": 0.9998745322227478, + "sampling/importance_sampling_ratio/min": 0.6232456564903259, + "sampling/sampling_logp_difference/max": 0.47281455993652344, + "sampling/sampling_logp_difference/mean": 0.014073856174945831, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 225.515625, + "completions/mean_terminated_length": 225.515625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4128747284412384, + "epoch": 0.023284313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1387017471697445, + "kl": 0.0005442682886496186, + "learning_rate": 7.317073170731706e-08, + "loss": 0.0208, + "num_tokens": 586180.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6147140264511108, + "sampling/importance_sampling_ratio/mean": 0.9997754693031311, + "sampling/importance_sampling_ratio/min": 0.4225813150405884, + "sampling/sampling_logp_difference/max": 0.8613734245300293, + "sampling/sampling_logp_difference/mean": 0.016178175806999207, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 175.5, + "completions/mean_terminated_length": 175.5, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.34866586327552795, + "epoch": 0.024509803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.127891737670114, + "kl": 0.0005755886086262763, + "learning_rate": 7.723577235772358e-08, + "loss": 0.0189, + "num_tokens": 617060.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.489638328552246, + "sampling/importance_sampling_ratio/mean": 0.9997918009757996, + "sampling/importance_sampling_ratio/min": 0.6267635226249695, + "sampling/sampling_logp_difference/max": 0.46718597412109375, + "sampling/sampling_logp_difference/mean": 0.015172285959124565, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 174.75, + "completions/mean_terminated_length": 174.75, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3468027412891388, + "epoch": 0.025735294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9147208366057095, + "kl": 0.0005272025009617209, + "learning_rate": 8.130081300813008e-08, + "loss": -0.0192, + "num_tokens": 645860.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.622986912727356, + "sampling/importance_sampling_ratio/mean": 0.9999600052833557, + "sampling/importance_sampling_ratio/min": 0.6193138360977173, + "sampling/sampling_logp_difference/max": 0.4842681884765625, + "sampling/sampling_logp_difference/mean": 0.01507607288658619, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 180.984375, + "completions/mean_terminated_length": 180.984375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.29409506916999817, + "epoch": 0.02696078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.071189239381354, + "kl": 0.0004567463183775544, + "learning_rate": 8.536585365853659e-08, + "loss": -0.0218, + "num_tokens": 677347.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5818450450897217, + "sampling/importance_sampling_ratio/mean": 0.9996393918991089, + "sampling/importance_sampling_ratio/min": 0.4838048219680786, + "sampling/sampling_logp_difference/max": 0.7260737419128418, + "sampling/sampling_logp_difference/mean": 0.013134635984897614, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 227.15625, + "completions/mean_terminated_length": 227.15625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4192052483558655, + "epoch": 0.028186274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8350320483904694, + "kl": 0.000575724639929831, + "learning_rate": 8.943089430894309e-08, + "loss": -0.0138, + "num_tokens": 711885.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4994769096374512, + "sampling/importance_sampling_ratio/mean": 0.9990925192832947, + "sampling/importance_sampling_ratio/min": 0.37036260962486267, + "sampling/sampling_logp_difference/max": 0.9932727813720703, + "sampling/sampling_logp_difference/mean": 0.017731059342622757, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 182.71875, + "completions/mean_terminated_length": 182.71875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3272732198238373, + "epoch": 0.029411764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1614412937586946, + "kl": 0.0005397280328907073, + "learning_rate": 9.349593495934959e-08, + "loss": 0.0425, + "num_tokens": 739595.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5641429424285889, + "sampling/importance_sampling_ratio/mean": 0.9999706745147705, + "sampling/importance_sampling_ratio/min": 0.6439087986946106, + "sampling/sampling_logp_difference/max": 0.4473381042480469, + "sampling/sampling_logp_difference/mean": 0.01354675181210041, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 230.0, + "completions/mean_terminated_length": 230.0, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.46684208512306213, + "epoch": 0.030637254901960783, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031829255161353784, + "kl": 0.0006086308276280761, + "learning_rate": 9.75609756097561e-08, + "loss": 0.0, + "num_tokens": 774955.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6248189210891724, + "sampling/importance_sampling_ratio/mean": 0.9997011423110962, + "sampling/importance_sampling_ratio/min": 0.6368674039840698, + "sampling/sampling_logp_difference/max": 0.4853963851928711, + "sampling/sampling_logp_difference/mean": 0.018731631338596344, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 177.09375, + "completions/mean_terminated_length": 177.09375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.26852524280548096, + "epoch": 0.031862745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003296172078637039, + "kl": 0.0005129881319589913, + "learning_rate": 1.016260162601626e-07, + "loss": 0.0, + "num_tokens": 802769.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5277632474899292, + "sampling/importance_sampling_ratio/mean": 1.000476598739624, + "sampling/importance_sampling_ratio/min": 0.6301831007003784, + "sampling/sampling_logp_difference/max": 0.4617447853088379, + "sampling/sampling_logp_difference/mean": 0.012203315272927284, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 210.625, + "completions/mean_terminated_length": 210.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.28598493337631226, + "epoch": 0.03308823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0017730737089246148, + "kl": 0.00047270144568756223, + "learning_rate": 1.0569105691056911e-07, + "loss": 0.0, + "num_tokens": 838137.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6515837907791138, + "sampling/importance_sampling_ratio/mean": 1.000171422958374, + "sampling/importance_sampling_ratio/min": 0.6097134351730347, + "sampling/sampling_logp_difference/max": 0.501734733581543, + "sampling/sampling_logp_difference/mean": 0.013479228131473064, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 202.046875, + "completions/mean_terminated_length": 202.046875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.4291865825653076, + "epoch": 0.03431372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7712373761585439, + "kl": 0.0006866075564175844, + "learning_rate": 1.097560975609756e-07, + "loss": 0.0068, + "num_tokens": 867660.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.7520534992218018, + "sampling/importance_sampling_ratio/mean": 0.999358594417572, + "sampling/importance_sampling_ratio/min": 0.6255505681037903, + "sampling/sampling_logp_difference/max": 0.5607885122299194, + "sampling/sampling_logp_difference/mean": 0.018599187955260277, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 194.671875, + "completions/mean_terminated_length": 194.671875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3803984522819519, + "epoch": 0.03553921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3284860712625153, + "kl": 0.0005068336031399667, + "learning_rate": 1.1382113821138211e-07, + "loss": -0.0224, + "num_tokens": 898327.0, + "reward": 0.59375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.7105915546417236, + "sampling/importance_sampling_ratio/mean": 0.9996547698974609, + "sampling/importance_sampling_ratio/min": 0.6176413893699646, + "sampling/sampling_logp_difference/max": 0.5368392467498779, + "sampling/sampling_logp_difference/mean": 0.015175139531493187, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 170.203125, + "completions/mean_terminated_length": 170.203125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3811706006526947, + "epoch": 0.03676470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1298075740240563, + "kl": 0.0007677193498238921, + "learning_rate": 1.1788617886178862e-07, + "loss": 0.001, + "num_tokens": 922660.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6084727048873901, + "sampling/importance_sampling_ratio/mean": 1.0000172853469849, + "sampling/importance_sampling_ratio/min": 0.6318622827529907, + "sampling/sampling_logp_difference/max": 0.47528505325317383, + "sampling/sampling_logp_difference/mean": 0.01936884969472885, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 196.421875, + "completions/mean_terminated_length": 196.421875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3232734501361847, + "epoch": 0.03799019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002693476953860462, + "kl": 0.000636767887044698, + "learning_rate": 1.219512195121951e-07, + "loss": 0.0, + "num_tokens": 950223.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6622308492660522, + "sampling/importance_sampling_ratio/mean": 1.000096321105957, + "sampling/importance_sampling_ratio/min": 0.6301569938659668, + "sampling/sampling_logp_difference/max": 0.5081605911254883, + "sampling/sampling_logp_difference/mean": 0.015153428539633751, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 182.0625, + "completions/mean_terminated_length": 182.0625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3069888949394226, + "epoch": 0.0392156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030196274756335035, + "kl": 0.0006638698978349566, + "learning_rate": 1.260162601626016e-07, + "loss": 0.0, + "num_tokens": 986755.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8526816368103027, + "sampling/importance_sampling_ratio/mean": 1.000173807144165, + "sampling/importance_sampling_ratio/min": 0.6228201985359192, + "sampling/sampling_logp_difference/max": 0.6166341304779053, + "sampling/sampling_logp_difference/mean": 0.014696375466883183, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 222.28125, + "completions/mean_terminated_length": 222.28125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3425103425979614, + "epoch": 0.04044117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7401089766711731, + "kl": 0.0005140869179740548, + "learning_rate": 1.3008130081300813e-07, + "loss": 0.0137, + "num_tokens": 1019797.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6463189125061035, + "sampling/importance_sampling_ratio/mean": 0.9994274377822876, + "sampling/importance_sampling_ratio/min": 0.6264773011207581, + "sampling/sampling_logp_difference/max": 0.49854183197021484, + "sampling/sampling_logp_difference/mean": 0.014486259780824184, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 220.4375, + "completions/mean_terminated_length": 220.4375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.35915452241897583, + "epoch": 0.041666666666666664, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4135222586157743, + "kl": 0.0005540281417779624, + "learning_rate": 1.3414634146341465e-07, + "loss": -0.0628, + "num_tokens": 1053873.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.629637360572815, + "sampling/importance_sampling_ratio/mean": 1.0001004934310913, + "sampling/importance_sampling_ratio/min": 0.5777447819709778, + "sampling/sampling_logp_difference/max": 0.5486230850219727, + "sampling/sampling_logp_difference/mean": 0.014785278588533401, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 195.3125, + "completions/mean_terminated_length": 195.3125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3688523769378662, + "epoch": 0.0428921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9374397758001798, + "kl": 0.000617923797108233, + "learning_rate": 1.3821138211382114e-07, + "loss": 0.0148, + "num_tokens": 1086149.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3920141458511353, + "sampling/importance_sampling_ratio/mean": 0.9999972581863403, + "sampling/importance_sampling_ratio/min": 0.622948169708252, + "sampling/sampling_logp_difference/max": 0.47329187393188477, + "sampling/sampling_logp_difference/mean": 0.01586540974676609, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 170.046875, + "completions/mean_terminated_length": 170.046875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.35302817821502686, + "epoch": 0.04411764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002595660226809006, + "kl": 0.000633524265140295, + "learning_rate": 1.4227642276422763e-07, + "loss": 0.0, + "num_tokens": 1117704.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6610043048858643, + "sampling/importance_sampling_ratio/mean": 0.999608039855957, + "sampling/importance_sampling_ratio/min": 0.6538148522377014, + "sampling/sampling_logp_difference/max": 0.5074224472045898, + "sampling/sampling_logp_difference/mean": 0.016183484345674515, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 227.875, + "completions/mean_terminated_length": 227.875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.45303717255592346, + "epoch": 0.04534313725490196, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.60602766171904, + "kl": 0.0005130674107931554, + "learning_rate": 1.4634146341463413e-07, + "loss": 0.0166, + "num_tokens": 1151184.0, + "reward": 0.4375, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3496991395950317, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.6291665434837341, + "sampling/sampling_logp_difference/max": 0.46335935592651367, + "sampling/sampling_logp_difference/mean": 0.01684834063053131, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 218.3125, + "completions/mean_terminated_length": 218.3125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3316297233104706, + "epoch": 0.04656862745098039, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3069275862513603, + "kl": 0.0005636264686472714, + "learning_rate": 1.5040650406504065e-07, + "loss": -0.0063, + "num_tokens": 1185940.0, + "reward": 0.28125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.495247483253479, + "sampling/importance_sampling_ratio/mean": 1.000683307647705, + "sampling/importance_sampling_ratio/min": 0.6321820616722107, + "sampling/sampling_logp_difference/max": 0.45857787132263184, + "sampling/sampling_logp_difference/mean": 0.014935510233044624, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 212.828125, + "completions/mean_terminated_length": 212.828125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.43204957246780396, + "epoch": 0.04779411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1820639433995703, + "kl": 0.0007093902095220983, + "learning_rate": 1.5447154471544717e-07, + "loss": 0.0263, + "num_tokens": 1213913.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.620730996131897, + "sampling/importance_sampling_ratio/mean": 1.0002366304397583, + "sampling/importance_sampling_ratio/min": 0.6387297511100769, + "sampling/sampling_logp_difference/max": 0.482877254486084, + "sampling/sampling_logp_difference/mean": 0.017711803317070007, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 191.453125, + "completions/mean_terminated_length": 191.453125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.4706189036369324, + "epoch": 0.049019607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0024790988863974115, + "kl": 0.0006636378820985556, + "learning_rate": 1.5853658536585366e-07, + "loss": 0.0, + "num_tokens": 1244230.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.614878535270691, + "sampling/importance_sampling_ratio/mean": 1.0000481605529785, + "sampling/importance_sampling_ratio/min": 0.6086853742599487, + "sampling/sampling_logp_difference/max": 0.49645376205444336, + "sampling/sampling_logp_difference/mean": 0.017970986664295197, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 204.125, + "completions/mean_terminated_length": 204.125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4288448393344879, + "epoch": 0.05024509803921569, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6378922477411386, + "kl": 0.0005438131629489362, + "learning_rate": 1.6260162601626016e-07, + "loss": 0.027, + "num_tokens": 1273534.0, + "reward": 0.25, + "reward_std": 0.7623475193977356, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6325310468673706, + "sampling/importance_sampling_ratio/mean": 0.9998799562454224, + "sampling/importance_sampling_ratio/min": 0.5624527931213379, + "sampling/sampling_logp_difference/max": 0.5754480361938477, + "sampling/sampling_logp_difference/mean": 0.01654253527522087, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 187.546875, + "completions/mean_terminated_length": 187.546875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3429829180240631, + "epoch": 0.051470588235294115, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.885346806108457, + "kl": 0.0006208082195371389, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0071, + "num_tokens": 1303201.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.473521113395691, + "sampling/importance_sampling_ratio/mean": 1.0007667541503906, + "sampling/importance_sampling_ratio/min": 0.679732620716095, + "sampling/sampling_logp_difference/max": 0.38765478134155273, + "sampling/sampling_logp_difference/mean": 0.015743326395750046, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 149.125, + "completions/mean_terminated_length": 149.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3522500991821289, + "epoch": 0.05269607843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5506841207393454, + "kl": 0.0006877593696117401, + "learning_rate": 1.7073170731707317e-07, + "loss": 0.0157, + "num_tokens": 1327577.0, + "reward": 0.53125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4399316310882568, + "sampling/importance_sampling_ratio/mean": 0.99951171875, + "sampling/importance_sampling_ratio/min": 0.5969815254211426, + "sampling/sampling_logp_difference/max": 0.515869140625, + "sampling/sampling_logp_difference/mean": 0.016115540638566017, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 185.1875, + "completions/mean_terminated_length": 185.1875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.40408167243003845, + "epoch": 0.05392156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9268900876701384, + "kl": 0.0006102619227021933, + "learning_rate": 1.7479674796747966e-07, + "loss": -0.0044, + "num_tokens": 1357861.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6210482120513916, + "sampling/importance_sampling_ratio/mean": 0.9999095797538757, + "sampling/importance_sampling_ratio/min": 0.6182252764701843, + "sampling/sampling_logp_difference/max": 0.48307299613952637, + "sampling/sampling_logp_difference/mean": 0.01602936163544655, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 231.109375, + "completions/mean_terminated_length": 231.109375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.42715632915496826, + "epoch": 0.05514705882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1764580024959685, + "kl": 0.0006182037759572268, + "learning_rate": 1.7886178861788619e-07, + "loss": -0.0593, + "num_tokens": 1402540.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6230255365371704, + "sampling/importance_sampling_ratio/mean": 1.0002163648605347, + "sampling/importance_sampling_ratio/min": 0.5318665504455566, + "sampling/sampling_logp_difference/max": 0.6313626766204834, + "sampling/sampling_logp_difference/mean": 0.017635690048336983, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 172.46875, + "completions/mean_terminated_length": 172.46875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3358648121356964, + "epoch": 0.056372549019607844, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0010274065990643, + "kl": 0.0006443964084610343, + "learning_rate": 1.8292682926829268e-07, + "loss": 0.0242, + "num_tokens": 1425498.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4986183643341064, + "sampling/importance_sampling_ratio/mean": 0.9992099404335022, + "sampling/importance_sampling_ratio/min": 0.6184446215629578, + "sampling/sampling_logp_difference/max": 0.4805476665496826, + "sampling/sampling_logp_difference/mean": 0.016474541276693344, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 189.875, + "completions/mean_terminated_length": 189.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3795761466026306, + "epoch": 0.05759803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029523160938459856, + "kl": 0.0006913819233886898, + "learning_rate": 1.8699186991869917e-07, + "loss": 0.0, + "num_tokens": 1453602.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4270973205566406, + "sampling/importance_sampling_ratio/mean": 0.9996356964111328, + "sampling/importance_sampling_ratio/min": 0.6219667792320251, + "sampling/sampling_logp_difference/max": 0.4748685359954834, + "sampling/sampling_logp_difference/mean": 0.016697335988283157, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 183.875, + "completions/mean_terminated_length": 183.875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.28151488304138184, + "epoch": 0.058823529411764705, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0028058067350995694, + "kl": 0.000663014012388885, + "learning_rate": 1.910569105691057e-07, + "loss": 0.0, + "num_tokens": 1480122.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.623085856437683, + "sampling/importance_sampling_ratio/mean": 1.0003767013549805, + "sampling/importance_sampling_ratio/min": 0.6483780145645142, + "sampling/sampling_logp_difference/max": 0.4843292236328125, + "sampling/sampling_logp_difference/mean": 0.014155544340610504, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 170.28125, + "completions/mean_terminated_length": 170.28125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.43864426016807556, + "epoch": 0.06004901960784314, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9550066819946098, + "kl": 0.0007230397895909846, + "learning_rate": 1.951219512195122e-07, + "loss": -0.0301, + "num_tokens": 1509404.0, + "reward": -0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4191150665283203, + "sampling/importance_sampling_ratio/mean": 1.0003002882003784, + "sampling/importance_sampling_ratio/min": 0.37228211760520935, + "sampling/sampling_logp_difference/max": 0.9881033897399902, + "sampling/sampling_logp_difference/mean": 0.0182917769998312, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 173.3125, + "completions/mean_terminated_length": 173.3125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3217679262161255, + "epoch": 0.061274509803921566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004175269613817593, + "kl": 0.0007267352193593979, + "learning_rate": 1.9918699186991868e-07, + "loss": 0.0, + "num_tokens": 1537168.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3975340127944946, + "sampling/importance_sampling_ratio/mean": 1.000385046005249, + "sampling/importance_sampling_ratio/min": 0.4469026029109955, + "sampling/sampling_logp_difference/max": 0.8054146766662598, + "sampling/sampling_logp_difference/mean": 0.015700964257121086, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 177.296875, + "completions/mean_terminated_length": 177.296875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.4188961982727051, + "epoch": 0.0625, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.397865288577748, + "kl": 0.0008356262696906924, + "learning_rate": 2.032520325203252e-07, + "loss": -0.0179, + "num_tokens": 1566051.0, + "reward": 0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8231755495071411, + "sampling/importance_sampling_ratio/mean": 1.0001540184020996, + "sampling/importance_sampling_ratio/min": 0.6267947554588318, + "sampling/sampling_logp_difference/max": 0.6005797386169434, + "sampling/sampling_logp_difference/mean": 0.0184800885617733, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 214.90625, + "completions/mean_terminated_length": 214.90625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4530605673789978, + "epoch": 0.06372549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2814797926932124, + "kl": 0.000573880213778466, + "learning_rate": 2.073170731707317e-07, + "loss": 0.0593, + "num_tokens": 1605629.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4754071235656738, + "sampling/importance_sampling_ratio/mean": 0.9998928308486938, + "sampling/importance_sampling_ratio/min": 0.5536072850227356, + "sampling/sampling_logp_difference/max": 0.5912997722625732, + "sampling/sampling_logp_difference/mean": 0.01821010746061802, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 186.6875, + "completions/mean_terminated_length": 186.6875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3925783634185791, + "epoch": 0.06495098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0030464339417726537, + "kl": 0.0006757756927981973, + "learning_rate": 2.1138211382113822e-07, + "loss": 0.0, + "num_tokens": 1639193.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8943507671356201, + "sampling/importance_sampling_ratio/mean": 0.9993544816970825, + "sampling/importance_sampling_ratio/min": 0.6423728466033936, + "sampling/sampling_logp_difference/max": 0.63887619972229, + "sampling/sampling_logp_difference/mean": 0.01692541316151619, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 213.59375, + "completions/mean_terminated_length": 213.59375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4210030734539032, + "epoch": 0.0661764705882353, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5213873136283464, + "kl": 0.000580059364438057, + "learning_rate": 2.154471544715447e-07, + "loss": 0.0368, + "num_tokens": 1668063.0, + "reward": 0.21875, + "reward_std": 0.5539814233779907, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4799457788467407, + "sampling/importance_sampling_ratio/mean": 0.9999606609344482, + "sampling/importance_sampling_ratio/min": 0.4670323133468628, + "sampling/sampling_logp_difference/max": 0.7613568305969238, + "sampling/sampling_logp_difference/mean": 0.016792941838502884, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 192.5, + "completions/mean_terminated_length": 192.5, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.39150482416152954, + "epoch": 0.06740196078431372, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3123382300934188, + "kl": 0.0006600832566618919, + "learning_rate": 2.195121951219512e-07, + "loss": 0.0659, + "num_tokens": 1705231.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5363892316818237, + "sampling/importance_sampling_ratio/mean": 1.0001909732818604, + "sampling/importance_sampling_ratio/min": 0.610666036605835, + "sampling/sampling_logp_difference/max": 0.49320507049560547, + "sampling/sampling_logp_difference/mean": 0.01660531759262085, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 182.3125, + "completions/mean_terminated_length": 182.3125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.31077295541763306, + "epoch": 0.06862745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002448791989654552, + "kl": 0.0006591076962649822, + "learning_rate": 2.235772357723577e-07, + "loss": 0.0, + "num_tokens": 1735811.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.789772629737854, + "sampling/importance_sampling_ratio/mean": 0.9997566938400269, + "sampling/importance_sampling_ratio/min": 0.4267900586128235, + "sampling/sampling_logp_difference/max": 0.8514630794525146, + "sampling/sampling_logp_difference/mean": 0.015317326411604881, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 224.578125, + "completions/mean_terminated_length": 224.578125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3348657488822937, + "epoch": 0.06985294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7502445285201713, + "kl": 0.0005730668781325221, + "learning_rate": 2.2764227642276422e-07, + "loss": 0.0215, + "num_tokens": 1767112.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5211312770843506, + "sampling/importance_sampling_ratio/mean": 1.0004832744598389, + "sampling/importance_sampling_ratio/min": 0.5496712327003479, + "sampling/sampling_logp_difference/max": 0.5984349250793457, + "sampling/sampling_logp_difference/mean": 0.014861776493489742, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 156.359375, + "completions/mean_terminated_length": 156.359375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.3165697455406189, + "epoch": 0.07107843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9152807101002652, + "kl": 0.0006824785377830267, + "learning_rate": 2.3170731707317074e-07, + "loss": 0.0046, + "num_tokens": 1790991.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6023153066635132, + "sampling/importance_sampling_ratio/mean": 0.9994411468505859, + "sampling/importance_sampling_ratio/min": 0.6273468732833862, + "sampling/sampling_logp_difference/max": 0.471449613571167, + "sampling/sampling_logp_difference/mean": 0.015117138624191284, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 193.6875, + "completions/mean_terminated_length": 193.6875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.23887406289577484, + "epoch": 0.07230392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0033072866061355233, + "kl": 0.0006151001434773207, + "learning_rate": 2.3577235772357723e-07, + "loss": 0.0, + "num_tokens": 1817723.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6221637725830078, + "sampling/importance_sampling_ratio/mean": 1.0002944469451904, + "sampling/importance_sampling_ratio/min": 0.6680719256401062, + "sampling/sampling_logp_difference/max": 0.4837608337402344, + "sampling/sampling_logp_difference/mean": 0.01238096784800291, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 189.90625, + "completions/mean_terminated_length": 189.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.36334773898124695, + "epoch": 0.07352941176470588, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1494354820963015, + "kl": 0.0006102257175371051, + "learning_rate": 2.3983739837398373e-07, + "loss": -0.0286, + "num_tokens": 1844341.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.499464750289917, + "sampling/importance_sampling_ratio/mean": 1.0000648498535156, + "sampling/importance_sampling_ratio/min": 0.6169834733009338, + "sampling/sampling_logp_difference/max": 0.4829130172729492, + "sampling/sampling_logp_difference/mean": 0.015710413455963135, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 226.703125, + "completions/mean_terminated_length": 226.703125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2888544797897339, + "epoch": 0.07475490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.002962674547903842, + "kl": 0.0005360324867069721, + "learning_rate": 2.439024390243902e-07, + "loss": 0.0, + "num_tokens": 1878674.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4062519073486328, + "sampling/importance_sampling_ratio/mean": 1.000124454498291, + "sampling/importance_sampling_ratio/min": 0.4441189169883728, + "sampling/sampling_logp_difference/max": 0.8116629123687744, + "sampling/sampling_logp_difference/mean": 0.013115715235471725, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 181.46875, + "completions/mean_terminated_length": 181.46875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.34271660447120667, + "epoch": 0.07598039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005683725741503254, + "kl": 0.0007463833317160606, + "learning_rate": 2.479674796747967e-07, + "loss": 0.0, + "num_tokens": 1907968.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5646599531173706, + "sampling/importance_sampling_ratio/mean": 0.9999079704284668, + "sampling/importance_sampling_ratio/min": 0.3687233030796051, + "sampling/sampling_logp_difference/max": 0.997708797454834, + "sampling/sampling_logp_difference/mean": 0.016180504113435745, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 158.671875, + "completions/mean_terminated_length": 158.671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3199722468852997, + "epoch": 0.07720588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.213606157980981, + "kl": 0.0006946529028937221, + "learning_rate": 2.520325203252032e-07, + "loss": -0.0029, + "num_tokens": 1932379.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5072897672653198, + "sampling/importance_sampling_ratio/mean": 1.0006909370422363, + "sampling/importance_sampling_ratio/min": 0.6118065118789673, + "sampling/sampling_logp_difference/max": 0.49133920669555664, + "sampling/sampling_logp_difference/mean": 0.015027320943772793, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 208.796875, + "completions/mean_terminated_length": 208.796875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.416431188583374, + "epoch": 0.0784313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8846816716558332, + "kl": 0.0005686272052116692, + "learning_rate": 2.5609756097560976e-07, + "loss": 0.0138, + "num_tokens": 1966670.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6296825408935547, + "sampling/importance_sampling_ratio/mean": 1.000244140625, + "sampling/importance_sampling_ratio/min": 0.6546962857246399, + "sampling/sampling_logp_difference/max": 0.4883852005004883, + "sampling/sampling_logp_difference/mean": 0.016528785228729248, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 211.953125, + "completions/mean_terminated_length": 211.953125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.30486810207366943, + "epoch": 0.07965686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005054483798044397, + "kl": 0.0006241274531930685, + "learning_rate": 2.6016260162601625e-07, + "loss": 0.0, + "num_tokens": 1999547.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5277800559997559, + "sampling/importance_sampling_ratio/mean": 0.9995065927505493, + "sampling/importance_sampling_ratio/min": 0.3745501935482025, + "sampling/sampling_logp_difference/max": 0.9820294380187988, + "sampling/sampling_logp_difference/mean": 0.013507379218935966, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 185.734375, + "completions/mean_terminated_length": 185.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4169462323188782, + "epoch": 0.08088235294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5179666940788312, + "kl": 0.0006654216558672488, + "learning_rate": 2.6422764227642274e-07, + "loss": 0.0166, + "num_tokens": 2038266.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8034464120864868, + "sampling/importance_sampling_ratio/mean": 0.9994356632232666, + "sampling/importance_sampling_ratio/min": 0.5457186102867126, + "sampling/sampling_logp_difference/max": 0.60565185546875, + "sampling/sampling_logp_difference/mean": 0.017760932445526123, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 182.28125, + "completions/mean_terminated_length": 182.28125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.36594781279563904, + "epoch": 0.0821078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0225663216333327, + "kl": 0.000694277579896152, + "learning_rate": 2.682926829268293e-07, + "loss": 0.0023, + "num_tokens": 2065916.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.395979404449463, + "sampling/importance_sampling_ratio/mean": 0.9999436736106873, + "sampling/importance_sampling_ratio/min": 0.6172131299972534, + "sampling/sampling_logp_difference/max": 0.4825408458709717, + "sampling/sampling_logp_difference/mean": 0.016853000968694687, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 240.0, + "completions/mean_terminated_length": 240.0, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.39735686779022217, + "epoch": 0.08333333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.924838978895092, + "kl": 0.00048797359340824187, + "learning_rate": 2.7235772357723573e-07, + "loss": 0.0183, + "num_tokens": 2106668.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4861767292022705, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.5961363911628723, + "sampling/sampling_logp_difference/max": 0.5172858238220215, + "sampling/sampling_logp_difference/mean": 0.015098122879862785, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 204.203125, + "completions/mean_terminated_length": 204.203125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.43868178129196167, + "epoch": 0.08455882352941177, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.247102958162562, + "kl": 0.0006902640452608466, + "learning_rate": 2.764227642276423e-07, + "loss": -0.0286, + "num_tokens": 2136217.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4298206567764282, + "sampling/importance_sampling_ratio/mean": 1.0002532005310059, + "sampling/importance_sampling_ratio/min": 0.6179476380348206, + "sampling/sampling_logp_difference/max": 0.4813516139984131, + "sampling/sampling_logp_difference/mean": 0.018437707796692848, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 220.515625, + "completions/mean_terminated_length": 220.515625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4509945511817932, + "epoch": 0.0857843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2776782296849163, + "kl": 0.0005820897640660405, + "learning_rate": 2.8048780487804877e-07, + "loss": 0.0004, + "num_tokens": 2172970.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6365644931793213, + "sampling/importance_sampling_ratio/mean": 0.9998390674591064, + "sampling/importance_sampling_ratio/min": 0.33703523874282837, + "sampling/sampling_logp_difference/max": 1.0875678062438965, + "sampling/sampling_logp_difference/mean": 0.017490090802311897, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 187.453125, + "completions/mean_terminated_length": 187.453125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.28403034806251526, + "epoch": 0.08700980392156862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003153047585758706, + "kl": 0.0006190944695845246, + "learning_rate": 2.8455284552845527e-07, + "loss": 0.0, + "num_tokens": 2200375.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.276409387588501, + "sampling/importance_sampling_ratio/mean": 0.9995650053024292, + "sampling/importance_sampling_ratio/min": 0.6188837289810181, + "sampling/sampling_logp_difference/max": 0.47983789443969727, + "sampling/sampling_logp_difference/mean": 0.013221720233559608, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 249.796875, + "completions/mean_terminated_length": 249.796875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4590471386909485, + "epoch": 0.08823529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2136650583335309, + "kl": 0.0005434445920400321, + "learning_rate": 2.886178861788618e-07, + "loss": 0.0814, + "num_tokens": 2232778.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3725579977035522, + "sampling/importance_sampling_ratio/mean": 1.0002747774124146, + "sampling/importance_sampling_ratio/min": 0.6689460277557373, + "sampling/sampling_logp_difference/max": 0.4020519256591797, + "sampling/sampling_logp_difference/mean": 0.016279827803373337, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 236.3125, + "completions/mean_terminated_length": 236.3125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.41624367237091064, + "epoch": 0.08946078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9831733101434714, + "kl": 0.0006176315946504474, + "learning_rate": 2.9268292682926825e-07, + "loss": -0.0187, + "num_tokens": 2266718.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4574370384216309, + "sampling/importance_sampling_ratio/mean": 1.0001189708709717, + "sampling/importance_sampling_ratio/min": 0.5600042939186096, + "sampling/sampling_logp_difference/max": 0.5798108577728271, + "sampling/sampling_logp_difference/mean": 0.015832317993044853, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 169.28125, + "completions/mean_terminated_length": 169.28125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2937129735946655, + "epoch": 0.09068627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029593835234128824, + "kl": 0.0006617329199798405, + "learning_rate": 2.967479674796748e-07, + "loss": 0.0, + "num_tokens": 2290800.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6446852684020996, + "sampling/importance_sampling_ratio/mean": 0.9997486472129822, + "sampling/importance_sampling_ratio/min": 0.4876939654350281, + "sampling/sampling_logp_difference/max": 0.7180671691894531, + "sampling/sampling_logp_difference/mean": 0.014586403034627438, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 187.734375, + "completions/mean_terminated_length": 187.734375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.39507994055747986, + "epoch": 0.09191176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8095647807776131, + "kl": 0.0006857198313809931, + "learning_rate": 3.008130081300813e-07, + "loss": 0.0074, + "num_tokens": 2320239.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4050134420394897, + "sampling/importance_sampling_ratio/mean": 0.9997504949569702, + "sampling/importance_sampling_ratio/min": 0.6153008937835693, + "sampling/sampling_logp_difference/max": 0.4856438636779785, + "sampling/sampling_logp_difference/mean": 0.017259499058127403, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 170.671875, + "completions/mean_terminated_length": 170.671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.27348703145980835, + "epoch": 0.09313725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004614003351554754, + "kl": 0.0006742796977050602, + "learning_rate": 3.048780487804878e-07, + "loss": 0.0, + "num_tokens": 2345514.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6229311227798462, + "sampling/importance_sampling_ratio/mean": 1.0002658367156982, + "sampling/importance_sampling_ratio/min": 0.6682140231132507, + "sampling/sampling_logp_difference/max": 0.4842338562011719, + "sampling/sampling_logp_difference/mean": 0.013882113620638847, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 213.09375, + "completions/mean_terminated_length": 213.09375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.32055598497390747, + "epoch": 0.09436274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8590981531477943, + "kl": 0.0005941680865362287, + "learning_rate": 3.0894308943089434e-07, + "loss": -0.0238, + "num_tokens": 2376512.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4924719333648682, + "sampling/importance_sampling_ratio/mean": 0.9996081590652466, + "sampling/importance_sampling_ratio/min": 0.6148074865341187, + "sampling/sampling_logp_difference/max": 0.4864461421966553, + "sampling/sampling_logp_difference/mean": 0.014241337776184082, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 194.734375, + "completions/mean_terminated_length": 194.734375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.35685548186302185, + "epoch": 0.09558823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.97051460803959, + "kl": 0.000702782766893506, + "learning_rate": 3.130081300813008e-07, + "loss": 0.0093, + "num_tokens": 2403519.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.546732783317566, + "sampling/importance_sampling_ratio/mean": 0.9990202188491821, + "sampling/importance_sampling_ratio/min": 0.6093260049819946, + "sampling/sampling_logp_difference/max": 0.49540185928344727, + "sampling/sampling_logp_difference/mean": 0.016112372279167175, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 210.171875, + "completions/mean_terminated_length": 210.171875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3866679072380066, + "epoch": 0.09681372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.949837619775768, + "kl": 0.0005758454790338874, + "learning_rate": 3.170731707317073e-07, + "loss": -0.0207, + "num_tokens": 2438938.0, + "reward": 0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5747621059417725, + "sampling/importance_sampling_ratio/mean": 0.9996163249015808, + "sampling/importance_sampling_ratio/min": 0.6254509687423706, + "sampling/sampling_logp_difference/max": 0.4692823886871338, + "sampling/sampling_logp_difference/mean": 0.015203858725726604, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 232.84375, + "completions/mean_terminated_length": 232.84375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4193274974822998, + "epoch": 0.09803921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.350014838850733, + "kl": 0.0006424374878406525, + "learning_rate": 3.211382113821138e-07, + "loss": 0.012, + "num_tokens": 2473376.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.637217402458191, + "sampling/importance_sampling_ratio/mean": 1.000213861465454, + "sampling/importance_sampling_ratio/min": 0.6215099096298218, + "sampling/sampling_logp_difference/max": 0.4929981231689453, + "sampling/sampling_logp_difference/mean": 0.016783427447080612, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 137.75, + "completions/mean_terminated_length": 137.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.33013027906417847, + "epoch": 0.09926470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8962667578457428, + "kl": 0.0007790140807628632, + "learning_rate": 3.252032520325203e-07, + "loss": 0.0044, + "num_tokens": 2500320.0, + "reward": 0.375, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.39943265914917, + "sampling/importance_sampling_ratio/mean": 0.9993496537208557, + "sampling/importance_sampling_ratio/min": 0.3412167429924011, + "sampling/sampling_logp_difference/max": 1.0752373933792114, + "sampling/sampling_logp_difference/mean": 0.016874413937330246, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 182.796875, + "completions/mean_terminated_length": 182.796875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.347734659910202, + "epoch": 0.10049019607843138, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.092648006740298, + "kl": 0.0006662920350208879, + "learning_rate": 3.292682926829268e-07, + "loss": -0.0194, + "num_tokens": 2526803.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6284722089767456, + "sampling/importance_sampling_ratio/mean": 1.0001274347305298, + "sampling/importance_sampling_ratio/min": 0.6815561652183533, + "sampling/sampling_logp_difference/max": 0.4876422882080078, + "sampling/sampling_logp_difference/mean": 0.01490835938602686, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 180.65625, + "completions/mean_terminated_length": 180.65625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3616575598716736, + "epoch": 0.1017156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003146758837083494, + "kl": 0.0007405805517919362, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0, + "num_tokens": 2552509.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6335662603378296, + "sampling/importance_sampling_ratio/mean": 0.9998728036880493, + "sampling/importance_sampling_ratio/min": 0.6259642839431763, + "sampling/sampling_logp_difference/max": 0.4907655715942383, + "sampling/sampling_logp_difference/mean": 0.016518738120794296, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 245.0, + "completions/mean_terminated_length": 245.0, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3743305802345276, + "epoch": 0.10294117647058823, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7807699925989458, + "kl": 0.0005720094195567071, + "learning_rate": 3.3739837398373985e-07, + "loss": 0.0007, + "num_tokens": 2589677.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.576317548751831, + "sampling/importance_sampling_ratio/mean": 1.0004931688308716, + "sampling/importance_sampling_ratio/min": 0.6033816933631897, + "sampling/sampling_logp_difference/max": 0.5052052736282349, + "sampling/sampling_logp_difference/mean": 0.01565365120768547, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 203.96875, + "completions/mean_terminated_length": 203.96875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4013028144836426, + "epoch": 0.10416666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9127288098095632, + "kl": 0.0007912813453003764, + "learning_rate": 3.4146341463414634e-07, + "loss": -0.0118, + "num_tokens": 2624283.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.480612874031067, + "sampling/importance_sampling_ratio/mean": 0.9996029734611511, + "sampling/importance_sampling_ratio/min": 0.6100897789001465, + "sampling/sampling_logp_difference/max": 0.49414920806884766, + "sampling/sampling_logp_difference/mean": 0.017500976100564003, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 162.78125, + "completions/mean_terminated_length": 162.78125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3269077241420746, + "epoch": 0.1053921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0857299767643662, + "kl": 0.0007406205404549837, + "learning_rate": 3.4552845528455284e-07, + "loss": 0.0118, + "num_tokens": 2650253.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.500138521194458, + "sampling/importance_sampling_ratio/mean": 1.000192642211914, + "sampling/importance_sampling_ratio/min": 0.6159108877182007, + "sampling/sampling_logp_difference/max": 0.4846529960632324, + "sampling/sampling_logp_difference/mean": 0.015514223836362362, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 213.9375, + "completions/mean_terminated_length": 213.9375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3853444457054138, + "epoch": 0.10661764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8887578822314383, + "kl": 0.000650618749205023, + "learning_rate": 3.4959349593495933e-07, + "loss": 0.0139, + "num_tokens": 2681929.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6184998750686646, + "sampling/importance_sampling_ratio/mean": 1.0007580518722534, + "sampling/importance_sampling_ratio/min": 0.6482299566268921, + "sampling/sampling_logp_difference/max": 0.48149967193603516, + "sampling/sampling_logp_difference/mean": 0.015420891344547272, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 179.578125, + "completions/mean_terminated_length": 179.578125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.30991730093955994, + "epoch": 0.10784313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0912913059503908, + "kl": 0.00088326825061813, + "learning_rate": 3.536585365853658e-07, + "loss": 0.0072, + "num_tokens": 2708206.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5446949005126953, + "sampling/importance_sampling_ratio/mean": 1.0001922845840454, + "sampling/importance_sampling_ratio/min": 0.662237823009491, + "sampling/sampling_logp_difference/max": 0.4348263740539551, + "sampling/sampling_logp_difference/mean": 0.014573959633708, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 195.625, + "completions/mean_terminated_length": 195.625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2804887294769287, + "epoch": 0.1090686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004476988539064151, + "kl": 0.0007030559936538339, + "learning_rate": 3.5772357723577237e-07, + "loss": 0.0, + "num_tokens": 2739430.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5749434232711792, + "sampling/importance_sampling_ratio/mean": 0.9996755123138428, + "sampling/importance_sampling_ratio/min": 0.5952799916267395, + "sampling/sampling_logp_difference/max": 0.5187234878540039, + "sampling/sampling_logp_difference/mean": 0.012805728241801262, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 185.703125, + "completions/mean_terminated_length": 185.703125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3637750744819641, + "epoch": 0.11029411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0987040629334006, + "kl": 0.0008268561214208603, + "learning_rate": 3.6178861788617886e-07, + "loss": 0.013, + "num_tokens": 2770115.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7029767036437988, + "sampling/importance_sampling_ratio/mean": 0.9999949336051941, + "sampling/importance_sampling_ratio/min": 0.5304797291755676, + "sampling/sampling_logp_difference/max": 0.6339735984802246, + "sampling/sampling_logp_difference/mean": 0.01635168120265007, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 225.59375, + "completions/mean_terminated_length": 225.59375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.39361435174942017, + "epoch": 0.11151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.003172991166124158, + "kl": 0.0006379781407304108, + "learning_rate": 3.6585365853658536e-07, + "loss": 0.0, + "num_tokens": 2803977.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4007288217544556, + "sampling/importance_sampling_ratio/mean": 1.0001379251480103, + "sampling/importance_sampling_ratio/min": 0.6248331665992737, + "sampling/sampling_logp_difference/max": 0.47027063369750977, + "sampling/sampling_logp_difference/mean": 0.015284635126590729, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 163.96875, + "completions/mean_terminated_length": 163.96875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.28181523084640503, + "epoch": 0.11274509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8625871157981916, + "kl": 0.0008367709233425558, + "learning_rate": 3.6991869918699185e-07, + "loss": 0.0095, + "num_tokens": 2827815.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.348543643951416, + "sampling/importance_sampling_ratio/mean": 1.000134825706482, + "sampling/importance_sampling_ratio/min": 0.6121360063552856, + "sampling/sampling_logp_difference/max": 0.4908008575439453, + "sampling/sampling_logp_difference/mean": 0.01428581029176712, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 247.796875, + "completions/mean_terminated_length": 247.796875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3397085666656494, + "epoch": 0.11397058823529412, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8270845133939637, + "kl": 0.0007522313389927149, + "learning_rate": 3.7398373983739835e-07, + "loss": -0.0304, + "num_tokens": 2862634.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5790555477142334, + "sampling/importance_sampling_ratio/mean": 1.0003384351730347, + "sampling/importance_sampling_ratio/min": 0.6127174496650696, + "sampling/sampling_logp_difference/max": 0.4898514747619629, + "sampling/sampling_logp_difference/mean": 0.014473985880613327, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 153.296875, + "completions/mean_terminated_length": 153.296875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3286474049091339, + "epoch": 0.11519607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1208988814368832, + "kl": 0.0012791944900527596, + "learning_rate": 3.7804878048780484e-07, + "loss": 0.0089, + "num_tokens": 2886029.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.527778148651123, + "sampling/importance_sampling_ratio/mean": 0.9993230104446411, + "sampling/importance_sampling_ratio/min": 0.6139498353004456, + "sampling/sampling_logp_difference/max": 0.4878420829772949, + "sampling/sampling_logp_difference/mean": 0.016680724918842316, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 255.171875, + "completions/mean_terminated_length": 255.171875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.35204946994781494, + "epoch": 0.11642156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1527361378746388, + "kl": 0.0008182760793715715, + "learning_rate": 3.821138211382114e-07, + "loss": 0.0161, + "num_tokens": 2921352.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5822359323501587, + "sampling/importance_sampling_ratio/mean": 1.0001647472381592, + "sampling/importance_sampling_ratio/min": 0.5632071495056152, + "sampling/sampling_logp_difference/max": 0.5741077065467834, + "sampling/sampling_logp_difference/mean": 0.014608575962483883, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 181.390625, + "completions/mean_terminated_length": 181.390625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.37239915132522583, + "epoch": 0.11764705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005972359418865025, + "kl": 0.0008802631637081504, + "learning_rate": 3.861788617886179e-07, + "loss": 0.0, + "num_tokens": 2953889.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6360015869140625, + "sampling/importance_sampling_ratio/mean": 1.0000799894332886, + "sampling/importance_sampling_ratio/min": 0.3959225118160248, + "sampling/sampling_logp_difference/max": 0.9265367984771729, + "sampling/sampling_logp_difference/mean": 0.016587747260928154, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 188.203125, + "completions/mean_terminated_length": 188.203125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.34458205103874207, + "epoch": 0.11887254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005579135570687071, + "kl": 0.0009129910613410175, + "learning_rate": 3.902439024390244e-07, + "loss": 0.0, + "num_tokens": 2982830.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4990748167037964, + "sampling/importance_sampling_ratio/mean": 1.0001230239868164, + "sampling/importance_sampling_ratio/min": 0.637395441532135, + "sampling/sampling_logp_difference/max": 0.4503650665283203, + "sampling/sampling_logp_difference/mean": 0.01516056526452303, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 176.8125, + "completions/mean_terminated_length": 176.8125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.39370280504226685, + "epoch": 0.12009803921568628, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3861097080427915, + "kl": 0.0012123179621994495, + "learning_rate": 3.9430894308943087e-07, + "loss": -0.0181, + "num_tokens": 3012642.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6179218292236328, + "sampling/importance_sampling_ratio/mean": 1.000260829925537, + "sampling/importance_sampling_ratio/min": 0.7118787169456482, + "sampling/sampling_logp_difference/max": 0.481142520904541, + "sampling/sampling_logp_difference/mean": 0.017914410680532455, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 183.265625, + "completions/mean_terminated_length": 183.265625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.40355098247528076, + "epoch": 0.1213235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0670426367198722, + "kl": 0.0011470717145130038, + "learning_rate": 3.9837398373983736e-07, + "loss": 0.0129, + "num_tokens": 3041123.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.545743703842163, + "sampling/importance_sampling_ratio/mean": 1.0005805492401123, + "sampling/importance_sampling_ratio/min": 0.6606181859970093, + "sampling/sampling_logp_difference/max": 0.4355051517486572, + "sampling/sampling_logp_difference/mean": 0.018123583868145943, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 246.734375, + "completions/mean_terminated_length": 246.734375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3937031626701355, + "epoch": 0.12254901960784313, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8196707546070746, + "kl": 0.0010341871529817581, + "learning_rate": 4.024390243902439e-07, + "loss": 0.0152, + "num_tokens": 3076018.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5291866064071655, + "sampling/importance_sampling_ratio/mean": 1.0004017353057861, + "sampling/importance_sampling_ratio/min": 0.6395527720451355, + "sampling/sampling_logp_difference/max": 0.44698619842529297, + "sampling/sampling_logp_difference/mean": 0.01549257431179285, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 239.59375, + "completions/mean_terminated_length": 239.59375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4018251299858093, + "epoch": 0.12377450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7982382817251258, + "kl": 0.0009367854800075293, + "learning_rate": 4.065040650406504e-07, + "loss": 0.0088, + "num_tokens": 3110232.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.650946021080017, + "sampling/importance_sampling_ratio/mean": 0.9998608827590942, + "sampling/importance_sampling_ratio/min": 0.6269895434379578, + "sampling/sampling_logp_difference/max": 0.5013484954833984, + "sampling/sampling_logp_difference/mean": 0.016505785286426544, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 204.421875, + "completions/mean_terminated_length": 204.421875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.32230591773986816, + "epoch": 0.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00750480542790826, + "kl": 0.0011915290961042047, + "learning_rate": 4.105691056910569e-07, + "loss": 0.0, + "num_tokens": 3142403.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5774788856506348, + "sampling/importance_sampling_ratio/mean": 0.9997325539588928, + "sampling/importance_sampling_ratio/min": 0.644951581954956, + "sampling/sampling_logp_difference/max": 0.4558279514312744, + "sampling/sampling_logp_difference/mean": 0.015833374112844467, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 167.09375, + "completions/mean_terminated_length": 167.09375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.32066941261291504, + "epoch": 0.12622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008640263493909388, + "kl": 0.001314917579293251, + "learning_rate": 4.146341463414634e-07, + "loss": 0.0, + "num_tokens": 3173497.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4054081439971924, + "sampling/importance_sampling_ratio/mean": 1.0004949569702148, + "sampling/importance_sampling_ratio/min": 0.6147712469100952, + "sampling/sampling_logp_difference/max": 0.48650503158569336, + "sampling/sampling_logp_difference/mean": 0.015146872028708458, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 200.3125, + "completions/mean_terminated_length": 200.3125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.38391387462615967, + "epoch": 0.12745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008062843336245079, + "kl": 0.0012803412973880768, + "learning_rate": 4.186991869918699e-07, + "loss": 0.0, + "num_tokens": 3204989.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3557255268096924, + "sampling/importance_sampling_ratio/mean": 1.000128984451294, + "sampling/importance_sampling_ratio/min": 0.6949819922447205, + "sampling/sampling_logp_difference/max": 0.363869309425354, + "sampling/sampling_logp_difference/mean": 0.0171358622610569, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 213.6875, + "completions/mean_terminated_length": 213.6875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3612138032913208, + "epoch": 0.12867647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007391311454523405, + "kl": 0.0011714284773916006, + "learning_rate": 4.2276422764227643e-07, + "loss": 0.0, + "num_tokens": 3238041.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6307578086853027, + "sampling/importance_sampling_ratio/mean": 1.0004353523254395, + "sampling/importance_sampling_ratio/min": 0.6108908653259277, + "sampling/sampling_logp_difference/max": 0.49283695220947266, + "sampling/sampling_logp_difference/mean": 0.016306662932038307, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 189.0625, + "completions/mean_terminated_length": 189.0625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3218452036380768, + "epoch": 0.12990196078431374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010263366656990456, + "kl": 0.0015877524856477976, + "learning_rate": 4.268292682926829e-07, + "loss": 0.0, + "num_tokens": 3265597.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5277644395828247, + "sampling/importance_sampling_ratio/mean": 0.9993202090263367, + "sampling/importance_sampling_ratio/min": 0.5364041328430176, + "sampling/sampling_logp_difference/max": 0.6228674650192261, + "sampling/sampling_logp_difference/mean": 0.016494762152433395, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 197.6875, + "completions/mean_terminated_length": 197.6875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2655947804450989, + "epoch": 0.13112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006369894213665063, + "kl": 0.0008513329667039216, + "learning_rate": 4.308943089430894e-07, + "loss": 0.0, + "num_tokens": 3303545.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4051260948181152, + "sampling/importance_sampling_ratio/mean": 0.9999285936355591, + "sampling/importance_sampling_ratio/min": 0.6263183951377869, + "sampling/sampling_logp_difference/max": 0.4678964614868164, + "sampling/sampling_logp_difference/mean": 0.012676535174250603, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 191.03125, + "completions/mean_terminated_length": 191.03125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.34833234548568726, + "epoch": 0.1323529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9575857032441931, + "kl": 0.0016228670720010996, + "learning_rate": 4.349593495934959e-07, + "loss": -0.0078, + "num_tokens": 3341883.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5313594341278076, + "sampling/importance_sampling_ratio/mean": 1.000009298324585, + "sampling/importance_sampling_ratio/min": 0.4878419041633606, + "sampling/sampling_logp_difference/max": 0.7177639007568359, + "sampling/sampling_logp_difference/mean": 0.01729895919561386, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 257.171875, + "completions/mean_terminated_length": 257.171875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.41857966780662537, + "epoch": 0.13357843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8022842118823669, + "kl": 0.0011155225802212954, + "learning_rate": 4.390243902439024e-07, + "loss": 0.0205, + "num_tokens": 3382614.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6637526750564575, + "sampling/importance_sampling_ratio/mean": 0.9996939897537231, + "sampling/importance_sampling_ratio/min": 0.62088942527771, + "sampling/sampling_logp_difference/max": 0.5090756416320801, + "sampling/sampling_logp_difference/mean": 0.017605653032660484, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 195.328125, + "completions/mean_terminated_length": 195.328125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3044811189174652, + "epoch": 0.13480392156862744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00782186742722866, + "kl": 0.0011501931585371494, + "learning_rate": 4.4308943089430896e-07, + "loss": 0.0, + "num_tokens": 3422267.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6345769166946411, + "sampling/importance_sampling_ratio/mean": 0.9995689988136292, + "sampling/importance_sampling_ratio/min": 0.595684289932251, + "sampling/sampling_logp_difference/max": 0.5180444717407227, + "sampling/sampling_logp_difference/mean": 0.014964728616178036, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 192.390625, + "completions/mean_terminated_length": 192.390625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.2591615915298462, + "epoch": 0.13602941176470587, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008764177979505195, + "kl": 0.001395724480971694, + "learning_rate": 4.471544715447154e-07, + "loss": 0.0, + "num_tokens": 3449172.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.620025634765625, + "sampling/importance_sampling_ratio/mean": 1.0004347562789917, + "sampling/importance_sampling_ratio/min": 0.6094062924385071, + "sampling/sampling_logp_difference/max": 0.4952700138092041, + "sampling/sampling_logp_difference/mean": 0.01283632405102253, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 216.203125, + "completions/mean_terminated_length": 216.203125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3501753509044647, + "epoch": 0.13725490196078433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007326647429011458, + "kl": 0.0012353763449937105, + "learning_rate": 4.5121951219512194e-07, + "loss": 0.0, + "num_tokens": 3479073.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6356961727142334, + "sampling/importance_sampling_ratio/mean": 1.0002961158752441, + "sampling/importance_sampling_ratio/min": 0.6576474905014038, + "sampling/sampling_logp_difference/max": 0.4920685291290283, + "sampling/sampling_logp_difference/mean": 0.014653654769062996, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 249.59375, + "completions/mean_terminated_length": 249.59375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.27881038188934326, + "epoch": 0.13848039215686275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0063514451005622094, + "kl": 0.0009215730242431164, + "learning_rate": 4.5528455284552844e-07, + "loss": 0.0, + "num_tokens": 3516983.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.620772361755371, + "sampling/importance_sampling_ratio/mean": 0.9997947812080383, + "sampling/importance_sampling_ratio/min": 0.6486902832984924, + "sampling/sampling_logp_difference/max": 0.48290276527404785, + "sampling/sampling_logp_difference/mean": 0.012355177663266659, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 174.09375, + "completions/mean_terminated_length": 174.09375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3457588851451874, + "epoch": 0.13970588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9080781021638189, + "kl": 0.0013797450810670853, + "learning_rate": 4.5934959349593493e-07, + "loss": 0.0197, + "num_tokens": 3545421.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.406604528427124, + "sampling/importance_sampling_ratio/mean": 1.000349760055542, + "sampling/importance_sampling_ratio/min": 0.662269115447998, + "sampling/sampling_logp_difference/max": 0.4120832681655884, + "sampling/sampling_logp_difference/mean": 0.015144167467951775, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 160.734375, + "completions/mean_terminated_length": 160.734375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.22996263206005096, + "epoch": 0.1409313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010243953254923614, + "kl": 0.0013740160502493382, + "learning_rate": 4.634146341463415e-07, + "loss": 0.0, + "num_tokens": 3571180.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5758028030395508, + "sampling/importance_sampling_ratio/mean": 0.9999841451644897, + "sampling/importance_sampling_ratio/min": 0.6630354523658752, + "sampling/sampling_logp_difference/max": 0.45476484298706055, + "sampling/sampling_logp_difference/mean": 0.012498574331402779, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 196.6875, + "completions/mean_terminated_length": 196.6875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.33914482593536377, + "epoch": 0.14215686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008385369656932104, + "kl": 0.001285051228478551, + "learning_rate": 4.674796747967479e-07, + "loss": 0.0, + "num_tokens": 3606984.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5045294761657715, + "sampling/importance_sampling_ratio/mean": 1.0004189014434814, + "sampling/importance_sampling_ratio/min": 0.6098470687866211, + "sampling/sampling_logp_difference/max": 0.49454712867736816, + "sampling/sampling_logp_difference/mean": 0.015076465904712677, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 229.140625, + "completions/mean_terminated_length": 229.140625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.30870968103408813, + "epoch": 0.14338235294117646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7474120105497813, + "kl": 0.0013130044098943472, + "learning_rate": 4.7154471544715447e-07, + "loss": 0.0243, + "num_tokens": 3640401.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000335454940796, + "sampling/importance_sampling_ratio/min": 0.6410284638404846, + "sampling/sampling_logp_difference/max": 0.7243653535842896, + "sampling/sampling_logp_difference/mean": 0.013415869325399399, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 245.890625, + "completions/mean_terminated_length": 245.890625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.28496605157852173, + "epoch": 0.14460784313725492, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7489191941362603, + "kl": 0.0012819442199543118, + "learning_rate": 4.756097560975609e-07, + "loss": 0.0032, + "num_tokens": 3675018.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5749266147613525, + "sampling/importance_sampling_ratio/mean": 0.9996001720428467, + "sampling/importance_sampling_ratio/min": 0.6194509267807007, + "sampling/sampling_logp_difference/max": 0.47892189025878906, + "sampling/sampling_logp_difference/mean": 0.01227761059999466, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 210.203125, + "completions/mean_terminated_length": 210.203125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.41157370805740356, + "epoch": 0.14583333333333334, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2819404259096743, + "kl": 0.0021230392158031464, + "learning_rate": 4.796747967479675e-07, + "loss": 0.0261, + "num_tokens": 3702663.0, + "reward": -0.21875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.448575496673584, + "sampling/importance_sampling_ratio/mean": 0.9992985725402832, + "sampling/importance_sampling_ratio/min": 0.5025063157081604, + "sampling/sampling_logp_difference/max": 0.6881470680236816, + "sampling/sampling_logp_difference/mean": 0.017737586051225662, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 198.546875, + "completions/mean_terminated_length": 198.546875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3729953169822693, + "epoch": 0.14705882352941177, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3678292203558666, + "kl": 0.002271226141601801, + "learning_rate": 4.83739837398374e-07, + "loss": -0.031, + "num_tokens": 3730858.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5564074516296387, + "sampling/importance_sampling_ratio/mean": 1.000110149383545, + "sampling/importance_sampling_ratio/min": 0.6549456715583801, + "sampling/sampling_logp_difference/max": 0.4423801898956299, + "sampling/sampling_logp_difference/mean": 0.01663437858223915, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 218.671875, + "completions/mean_terminated_length": 218.671875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.42304205894470215, + "epoch": 0.1482843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9284858253703636, + "kl": 0.0015721892705187201, + "learning_rate": 4.878048780487804e-07, + "loss": 0.0088, + "num_tokens": 3768037.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4394360780715942, + "sampling/importance_sampling_ratio/mean": 1.0001715421676636, + "sampling/importance_sampling_ratio/min": 0.6298869848251343, + "sampling/sampling_logp_difference/max": 0.46221494674682617, + "sampling/sampling_logp_difference/mean": 0.018140073865652084, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 214.03125, + "completions/mean_terminated_length": 214.03125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4136981666088104, + "epoch": 0.14950980392156862, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8067982302135566, + "kl": 0.001777131692506373, + "learning_rate": 4.91869918699187e-07, + "loss": -0.0021, + "num_tokens": 3801895.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.8919440507888794, + "sampling/importance_sampling_ratio/mean": 0.9996967911720276, + "sampling/importance_sampling_ratio/min": 0.3235868215560913, + "sampling/sampling_logp_difference/max": 1.1282877922058105, + "sampling/sampling_logp_difference/mean": 0.018149856477975845, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 164.8125, + "completions/mean_terminated_length": 164.8125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.35000666975975037, + "epoch": 0.15073529411764705, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9843246113783801, + "kl": 0.0021282376255840063, + "learning_rate": 4.959349593495934e-07, + "loss": -0.0143, + "num_tokens": 3826283.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4718385934829712, + "sampling/importance_sampling_ratio/mean": 1.0010018348693848, + "sampling/importance_sampling_ratio/min": 0.655526876449585, + "sampling/sampling_logp_difference/max": 0.42231595516204834, + "sampling/sampling_logp_difference/mean": 0.01719508320093155, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 218.109375, + "completions/mean_terminated_length": 218.109375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.25339066982269287, + "epoch": 0.15196078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008079150541077773, + "kl": 0.0012523139594122767, + "learning_rate": 5e-07, + "loss": 0.0, + "num_tokens": 3868402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4768990278244019, + "sampling/importance_sampling_ratio/mean": 1.0000702142715454, + "sampling/importance_sampling_ratio/min": 0.6396217942237854, + "sampling/sampling_logp_difference/max": 0.4468783140182495, + "sampling/sampling_logp_difference/mean": 0.012863853946328163, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 173.890625, + "completions/mean_terminated_length": 173.890625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3082340657711029, + "epoch": 0.15318627450980393, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0185918635434672, + "kl": 0.0016419864259660244, + "learning_rate": 5.040650406504064e-07, + "loss": -0.007, + "num_tokens": 3898715.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6783421039581299, + "sampling/importance_sampling_ratio/mean": 1.0000501871109009, + "sampling/importance_sampling_ratio/min": 0.19109542667865753, + "sampling/sampling_logp_difference/max": 1.654982328414917, + "sampling/sampling_logp_difference/mean": 0.014716839417815208, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 186.453125, + "completions/mean_terminated_length": 186.453125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3567272424697876, + "epoch": 0.15441176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3837709451569948, + "kl": 0.0018109779339283705, + "learning_rate": 5.081300813008131e-07, + "loss": 0.0437, + "num_tokens": 3927912.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7580591440200806, + "sampling/importance_sampling_ratio/mean": 1.000162959098816, + "sampling/importance_sampling_ratio/min": 0.6622379422187805, + "sampling/sampling_logp_difference/max": 0.5642104148864746, + "sampling/sampling_logp_difference/mean": 0.014668013900518417, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 166.78125, + "completions/mean_terminated_length": 166.78125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.30008426308631897, + "epoch": 0.1556372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014836222650616791, + "kl": 0.0022569410502910614, + "learning_rate": 5.121951219512195e-07, + "loss": 0.0, + "num_tokens": 3954170.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 1.000068187713623, + "sampling/importance_sampling_ratio/min": 0.613304078578949, + "sampling/sampling_logp_difference/max": 0.4888944625854492, + "sampling/sampling_logp_difference/mean": 0.014907223172485828, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 157.515625, + "completions/mean_terminated_length": 157.515625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.32169437408447266, + "epoch": 0.1568627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014807923597855412, + "kl": 0.002098316326737404, + "learning_rate": 5.16260162601626e-07, + "loss": 0.0, + "num_tokens": 3980747.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5191376209259033, + "sampling/importance_sampling_ratio/mean": 0.9997336864471436, + "sampling/importance_sampling_ratio/min": 0.6103330254554749, + "sampling/sampling_logp_difference/max": 0.49375057220458984, + "sampling/sampling_logp_difference/mean": 0.016123417764902115, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 271.0625, + "completions/mean_terminated_length": 271.0625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.29940271377563477, + "epoch": 0.15808823529411764, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0916190885188615, + "kl": 0.001945118885487318, + "learning_rate": 5.203252032520325e-07, + "loss": 0.04, + "num_tokens": 4016639.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4510223865509033, + "sampling/importance_sampling_ratio/mean": 0.9998721480369568, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.014073947444558144, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 195.046875, + "completions/mean_terminated_length": 195.046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.30529260635375977, + "epoch": 0.15931372549019607, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8566925567928513, + "kl": 0.00208301586098969, + "learning_rate": 5.24390243902439e-07, + "loss": 0.0082, + "num_tokens": 4045778.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008527040481567, + "sampling/importance_sampling_ratio/min": 0.6134491562843323, + "sampling/sampling_logp_difference/max": 0.698920488357544, + "sampling/sampling_logp_difference/mean": 0.013193566352128983, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 148.796875, + "completions/mean_terminated_length": 148.796875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.28194862604141235, + "epoch": 0.16053921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01877514514420134, + "kl": 0.002552943304181099, + "learning_rate": 5.284552845528455e-07, + "loss": 0.0, + "num_tokens": 4073733.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.578508973121643, + "sampling/importance_sampling_ratio/mean": 1.0002617835998535, + "sampling/importance_sampling_ratio/min": 0.3664003014564514, + "sampling/sampling_logp_difference/max": 1.0040287971496582, + "sampling/sampling_logp_difference/mean": 0.014787127263844013, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 154.109375, + "completions/mean_terminated_length": 154.109375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2850548326969147, + "epoch": 0.16176470588235295, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1842928599356437, + "kl": 0.0039580995216965675, + "learning_rate": 5.325203252032519e-07, + "loss": -0.0018, + "num_tokens": 4100572.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4497106075286865, + "sampling/importance_sampling_ratio/mean": 0.9997336864471436, + "sampling/importance_sampling_ratio/min": 0.6338287591934204, + "sampling/sampling_logp_difference/max": 0.4559764862060547, + "sampling/sampling_logp_difference/mean": 0.015777725726366043, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 146.546875, + "completions/mean_terminated_length": 146.546875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.35511481761932373, + "epoch": 0.16299019607843138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024482506909237974, + "kl": 0.0033495896495878696, + "learning_rate": 5.365853658536586e-07, + "loss": 0.0, + "num_tokens": 4129567.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6090610027313232, + "sampling/importance_sampling_ratio/mean": 1.0000367164611816, + "sampling/importance_sampling_ratio/min": 0.6313496828079224, + "sampling/sampling_logp_difference/max": 0.4756507873535156, + "sampling/sampling_logp_difference/mean": 0.0187184177339077, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 178.3125, + "completions/mean_terminated_length": 178.3125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.34629660844802856, + "epoch": 0.1642156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022087950599703535, + "kl": 0.002930707298219204, + "learning_rate": 5.40650406504065e-07, + "loss": 0.0, + "num_tokens": 4157059.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.615315556526184, + "sampling/importance_sampling_ratio/mean": 0.9998091459274292, + "sampling/importance_sampling_ratio/min": 0.6301627159118652, + "sampling/sampling_logp_difference/max": 0.47953033447265625, + "sampling/sampling_logp_difference/mean": 0.015880992636084557, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 191.1875, + "completions/mean_terminated_length": 191.1875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3255227208137512, + "epoch": 0.16544117647058823, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.116328162716102, + "kl": 0.002989242784678936, + "learning_rate": 5.447154471544715e-07, + "loss": -0.0028, + "num_tokens": 4191519.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6071367263793945, + "sampling/importance_sampling_ratio/mean": 1.0001487731933594, + "sampling/importance_sampling_ratio/min": 0.6103999614715576, + "sampling/sampling_logp_difference/max": 0.4936408996582031, + "sampling/sampling_logp_difference/mean": 0.015889611095190048, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 203.546875, + "completions/mean_terminated_length": 203.546875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3820006251335144, + "epoch": 0.16666666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02492721865244265, + "kl": 0.0034382841549813747, + "learning_rate": 5.487804878048781e-07, + "loss": 0.0, + "num_tokens": 4226802.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8545461893081665, + "sampling/importance_sampling_ratio/mean": 1.000016212463379, + "sampling/importance_sampling_ratio/min": 0.6229522228240967, + "sampling/sampling_logp_difference/max": 0.6176400184631348, + "sampling/sampling_logp_difference/mean": 0.017170652747154236, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.28024011850357056, + "epoch": 0.16789215686274508, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019821679154392032, + "kl": 0.002866612747311592, + "learning_rate": 5.528455284552846e-07, + "loss": 0.0, + "num_tokens": 4255186.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.581290364265442, + "sampling/importance_sampling_ratio/mean": 1.0009236335754395, + "sampling/importance_sampling_ratio/min": 0.6172630190849304, + "sampling/sampling_logp_difference/max": 0.48246002197265625, + "sampling/sampling_logp_difference/mean": 0.013574345037341118, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 206.515625, + "completions/mean_terminated_length": 206.515625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3306284546852112, + "epoch": 0.16911764705882354, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.022713078027099, + "kl": 0.003378923051059246, + "learning_rate": 5.56910569105691e-07, + "loss": -0.0052, + "num_tokens": 4285667.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4726227521896362, + "sampling/importance_sampling_ratio/mean": 1.0002403259277344, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.4826626777648926, + "sampling/sampling_logp_difference/mean": 0.0152666587382555, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 199.21875, + "completions/mean_terminated_length": 199.21875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3423955738544464, + "epoch": 0.17034313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9941180025099868, + "kl": 0.004970120266079903, + "learning_rate": 5.609756097560975e-07, + "loss": 0.0357, + "num_tokens": 4315665.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4934946298599243, + "sampling/importance_sampling_ratio/mean": 1.0001451969146729, + "sampling/importance_sampling_ratio/min": 0.3357149362564087, + "sampling/sampling_logp_difference/max": 1.0914928913116455, + "sampling/sampling_logp_difference/mean": 0.01607022061944008, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 172.5625, + "completions/mean_terminated_length": 172.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.31557244062423706, + "epoch": 0.1715686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0323187840939241, + "kl": 0.005174246151000261, + "learning_rate": 5.650406504065041e-07, + "loss": 0.0, + "num_tokens": 4346517.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.621016263961792, + "sampling/importance_sampling_ratio/mean": 0.9998432993888855, + "sampling/importance_sampling_ratio/min": 0.6001259088516235, + "sampling/sampling_logp_difference/max": 0.5106158256530762, + "sampling/sampling_logp_difference/mean": 0.01690821535885334, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 164.578125, + "completions/mean_terminated_length": 164.578125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.23700770735740662, + "epoch": 0.17279411764705882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02986836784298849, + "kl": 0.004557392559945583, + "learning_rate": 5.691056910569105e-07, + "loss": 0.0, + "num_tokens": 4372026.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5181050300598145, + "sampling/importance_sampling_ratio/mean": 0.9995629787445068, + "sampling/importance_sampling_ratio/min": 0.6191670894622803, + "sampling/sampling_logp_difference/max": 0.47938013076782227, + "sampling/sampling_logp_difference/mean": 0.013194214552640915, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 184.375, + "completions/mean_terminated_length": 184.375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3119884729385376, + "epoch": 0.17401960784313725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02426669659002909, + "kl": 0.004346243105828762, + "learning_rate": 5.73170731707317e-07, + "loss": 0.0, + "num_tokens": 4401538.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6281784772872925, + "sampling/importance_sampling_ratio/mean": 1.0002410411834717, + "sampling/importance_sampling_ratio/min": 0.639254093170166, + "sampling/sampling_logp_difference/max": 0.4874619245529175, + "sampling/sampling_logp_difference/mean": 0.01620330847799778, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 173.875, + "completions/mean_terminated_length": 173.875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2875828742980957, + "epoch": 0.17524509803921567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028400871307893368, + "kl": 0.004842736292630434, + "learning_rate": 5.772357723577236e-07, + "loss": 0.0, + "num_tokens": 4427338.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5251868963241577, + "sampling/importance_sampling_ratio/mean": 1.0000712871551514, + "sampling/importance_sampling_ratio/min": 0.680833637714386, + "sampling/sampling_logp_difference/max": 0.4221169948577881, + "sampling/sampling_logp_difference/mean": 0.015635253861546516, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 185.21875, + "completions/mean_terminated_length": 185.21875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.32663196325302124, + "epoch": 0.17647058823529413, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3266612292106221, + "kl": 0.005485333502292633, + "learning_rate": 5.813008130081301e-07, + "loss": 0.0223, + "num_tokens": 4466200.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5157999992370605, + "sampling/importance_sampling_ratio/mean": 0.9999676942825317, + "sampling/importance_sampling_ratio/min": 0.41252991557121277, + "sampling/sampling_logp_difference/max": 0.8854465484619141, + "sampling/sampling_logp_difference/mean": 0.01599922403693199, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 161.515625, + "completions/mean_terminated_length": 161.515625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3241770267486572, + "epoch": 0.17769607843137256, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0026817610712315, + "kl": 0.005243861116468906, + "learning_rate": 5.853658536585365e-07, + "loss": -0.005, + "num_tokens": 4493977.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.60780668258667, + "sampling/importance_sampling_ratio/mean": 1.0003927946090698, + "sampling/importance_sampling_ratio/min": 0.6585583686828613, + "sampling/sampling_logp_difference/max": 0.4748709201812744, + "sampling/sampling_logp_difference/mean": 0.015489340759813786, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 218.15625, + "completions/mean_terminated_length": 218.15625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.27188339829444885, + "epoch": 0.17892156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037233272570007664, + "kl": 0.005310396663844585, + "learning_rate": 5.894308943089431e-07, + "loss": 0.0001, + "num_tokens": 4530899.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5263862609863281, + "sampling/importance_sampling_ratio/mean": 1.00016188621521, + "sampling/importance_sampling_ratio/min": 0.6371117830276489, + "sampling/sampling_logp_difference/max": 0.45081019401550293, + "sampling/sampling_logp_difference/mean": 0.013202058151364326, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 199.078125, + "completions/mean_terminated_length": 199.078125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3040648102760315, + "epoch": 0.1801470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0245297915998763, + "kl": 0.004978617187589407, + "learning_rate": 5.934959349593496e-07, + "loss": -0.0188, + "num_tokens": 4559208.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.7152482271194458, + "sampling/importance_sampling_ratio/mean": 1.0001599788665771, + "sampling/importance_sampling_ratio/min": 0.6121216416358948, + "sampling/sampling_logp_difference/max": 0.5395578145980835, + "sampling/sampling_logp_difference/mean": 0.015116818249225616, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3684394359588623, + "epoch": 0.18137254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.020667760502508, + "kl": 0.004764545243233442, + "learning_rate": 5.97560975609756e-07, + "loss": -0.0077, + "num_tokens": 4590242.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.537596344947815, + "sampling/importance_sampling_ratio/mean": 0.9995277523994446, + "sampling/importance_sampling_ratio/min": 0.6257607340812683, + "sampling/sampling_logp_difference/max": 0.46878719329833984, + "sampling/sampling_logp_difference/mean": 0.016859525814652443, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 188.484375, + "completions/mean_terminated_length": 188.484375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.292277991771698, + "epoch": 0.18259803921568626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03224063838537423, + "kl": 0.00572782289236784, + "learning_rate": 6.016260162601626e-07, + "loss": 0.0001, + "num_tokens": 4622033.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3439899682998657, + "sampling/importance_sampling_ratio/mean": 0.9993945360183716, + "sampling/importance_sampling_ratio/min": 0.6064043045043945, + "sampling/sampling_logp_difference/max": 0.5002083778381348, + "sampling/sampling_logp_difference/mean": 0.015040220692753792, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 156.171875, + "completions/mean_terminated_length": 156.171875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.27697500586509705, + "epoch": 0.18382352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1502898286460446, + "kl": 0.0055986312218010426, + "learning_rate": 6.056910569105691e-07, + "loss": -0.0426, + "num_tokens": 4646828.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00044584274292, + "sampling/importance_sampling_ratio/min": 0.6170914769172668, + "sampling/sampling_logp_difference/max": 0.7287707328796387, + "sampling/sampling_logp_difference/mean": 0.01559227705001831, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 204.25, + "completions/mean_terminated_length": 204.25, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.42049098014831543, + "epoch": 0.18504901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.362654332945133, + "kl": 0.005915883928537369, + "learning_rate": 6.097560975609756e-07, + "loss": -0.0603, + "num_tokens": 4682300.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5822070837020874, + "sampling/importance_sampling_ratio/mean": 0.9996191263198853, + "sampling/importance_sampling_ratio/min": 0.6361913084983826, + "sampling/sampling_logp_difference/max": 0.45882081985473633, + "sampling/sampling_logp_difference/mean": 0.016376720741391182, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 167.734375, + "completions/mean_terminated_length": 167.734375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3233358561992645, + "epoch": 0.18627450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029677670044690574, + "kl": 0.0062215314246714115, + "learning_rate": 6.13821138211382e-07, + "loss": 0.0001, + "num_tokens": 4713531.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5205823183059692, + "sampling/importance_sampling_ratio/mean": 0.9996426701545715, + "sampling/importance_sampling_ratio/min": 0.6058167219161987, + "sampling/sampling_logp_difference/max": 0.5011777877807617, + "sampling/sampling_logp_difference/mean": 0.01610369235277176, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 181.609375, + "completions/mean_terminated_length": 181.609375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.26142436265945435, + "epoch": 0.1875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021068254107589144, + "kl": 0.004076074808835983, + "learning_rate": 6.178861788617887e-07, + "loss": 0.0, + "num_tokens": 4741794.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4891793727874756, + "sampling/importance_sampling_ratio/mean": 0.9995283484458923, + "sampling/importance_sampling_ratio/min": 0.6404410004615784, + "sampling/sampling_logp_difference/max": 0.4455982446670532, + "sampling/sampling_logp_difference/mean": 0.012542849406599998, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 240.890625, + "completions/mean_terminated_length": 240.890625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.32881855964660645, + "epoch": 0.18872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01738556994587932, + "kl": 0.003266222309321165, + "learning_rate": 6.219512195121951e-07, + "loss": 0.0, + "num_tokens": 4776843.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5459238290786743, + "sampling/importance_sampling_ratio/mean": 0.9998267889022827, + "sampling/importance_sampling_ratio/min": 0.1480797976255417, + "sampling/sampling_logp_difference/max": 1.9100040197372437, + "sampling/sampling_logp_difference/mean": 0.014897173270583153, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 178.6875, + "completions/mean_terminated_length": 178.6875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3043786287307739, + "epoch": 0.18995098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.21318191034155, + "kl": 0.0040686968713998795, + "learning_rate": 6.260162601626016e-07, + "loss": -0.0486, + "num_tokens": 4804375.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6132657527923584, + "sampling/importance_sampling_ratio/mean": 1.000281572341919, + "sampling/importance_sampling_ratio/min": 0.6176195740699768, + "sampling/sampling_logp_difference/max": 0.48188257217407227, + "sampling/sampling_logp_difference/mean": 0.014598002657294273, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 212.75, + "completions/mean_terminated_length": 212.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.28690484166145325, + "epoch": 0.19117647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02361950208598998, + "kl": 0.004340208135545254, + "learning_rate": 6.300813008130081e-07, + "loss": 0.0, + "num_tokens": 4834823.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4644800424575806, + "sampling/importance_sampling_ratio/mean": 0.9994724988937378, + "sampling/importance_sampling_ratio/min": 0.6098414659500122, + "sampling/sampling_logp_difference/max": 0.4945563077926636, + "sampling/sampling_logp_difference/mean": 0.014058542437851429, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 176.578125, + "completions/mean_terminated_length": 176.578125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3075827956199646, + "epoch": 0.19240196078431374, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016886477020967572, + "kl": 0.003711746772751212, + "learning_rate": 6.341463414634146e-07, + "loss": 0.0, + "num_tokens": 4866460.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5459957122802734, + "sampling/importance_sampling_ratio/mean": 1.000352144241333, + "sampling/importance_sampling_ratio/min": 0.6635557413101196, + "sampling/sampling_logp_difference/max": 0.4356682300567627, + "sampling/sampling_logp_difference/mean": 0.014879366382956505, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 160.734375, + "completions/mean_terminated_length": 160.734375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.29727303981781006, + "epoch": 0.19362745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018378535930924606, + "kl": 0.00456765852868557, + "learning_rate": 6.382113821138211e-07, + "loss": 0.0, + "num_tokens": 4891451.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 0.9994612336158752, + "sampling/importance_sampling_ratio/min": 0.5615734457969666, + "sampling/sampling_logp_difference/max": 0.5770127773284912, + "sampling/sampling_logp_difference/mean": 0.014178035780787468, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 208.546875, + "completions/mean_terminated_length": 208.546875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.39588308334350586, + "epoch": 0.1948529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016381547211185944, + "kl": 0.00348066003061831, + "learning_rate": 6.422764227642276e-07, + "loss": 0.0, + "num_tokens": 4920494.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3604146242141724, + "sampling/importance_sampling_ratio/mean": 1.0002737045288086, + "sampling/importance_sampling_ratio/min": 0.6240179538726807, + "sampling/sampling_logp_difference/max": 0.4715762138366699, + "sampling/sampling_logp_difference/mean": 0.017580877989530563, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 196.703125, + "completions/mean_terminated_length": 196.703125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.41126543283462524, + "epoch": 0.19607843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9591859348671818, + "kl": 0.0034652426838874817, + "learning_rate": 6.463414634146342e-07, + "loss": -0.0195, + "num_tokens": 4952715.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4795540571212769, + "sampling/importance_sampling_ratio/mean": 1.0006828308105469, + "sampling/importance_sampling_ratio/min": 0.6171398758888245, + "sampling/sampling_logp_difference/max": 0.48265957832336426, + "sampling/sampling_logp_difference/mean": 0.017289428040385246, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 197.109375, + "completions/mean_terminated_length": 197.109375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.34364861249923706, + "epoch": 0.19730392156862744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01722778244946955, + "kl": 0.004390948452055454, + "learning_rate": 6.504065040650406e-07, + "loss": 0.0, + "num_tokens": 4982434.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999923825263977, + "sampling/importance_sampling_ratio/min": 0.6132733821868896, + "sampling/sampling_logp_difference/max": 0.9893083572387695, + "sampling/sampling_logp_difference/mean": 0.01679658517241478, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 218.671875, + "completions/mean_terminated_length": 218.671875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3521959185600281, + "epoch": 0.19852941176470587, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8587819363528818, + "kl": 0.0034514947328716516, + "learning_rate": 6.544715447154471e-07, + "loss": -0.0562, + "num_tokens": 5015149.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.433842658996582, + "sampling/importance_sampling_ratio/mean": 0.9994710683822632, + "sampling/importance_sampling_ratio/min": 0.49541690945625305, + "sampling/sampling_logp_difference/max": 0.7023556232452393, + "sampling/sampling_logp_difference/mean": 0.01686188578605652, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 236.5, + "completions/mean_terminated_length": 236.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.2904425263404846, + "epoch": 0.19975490196078433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012973321770188678, + "kl": 0.003004253376275301, + "learning_rate": 6.585365853658536e-07, + "loss": 0.0, + "num_tokens": 5047293.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4284110069274902, + "sampling/importance_sampling_ratio/mean": 1.0002599954605103, + "sampling/importance_sampling_ratio/min": 0.6202362179756165, + "sampling/sampling_logp_difference/max": 0.47765493392944336, + "sampling/sampling_logp_difference/mean": 0.013213622383773327, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 224.828125, + "completions/mean_terminated_length": 224.828125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.2878662049770355, + "epoch": 0.20098039215686275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012719596988869973, + "kl": 0.0029752333648502827, + "learning_rate": 6.626016260162602e-07, + "loss": 0.0, + "num_tokens": 5094402.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4756325483322144, + "sampling/importance_sampling_ratio/mean": 1.0003304481506348, + "sampling/importance_sampling_ratio/min": 0.5676484704017639, + "sampling/sampling_logp_difference/max": 0.5662529468536377, + "sampling/sampling_logp_difference/mean": 0.013716815039515495, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 217.4375, + "completions/mean_terminated_length": 217.4375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.37395769357681274, + "epoch": 0.20220588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8767704887704821, + "kl": 0.0044336821883916855, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0524, + "num_tokens": 5126030.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.525197148323059, + "sampling/importance_sampling_ratio/mean": 1.0002803802490234, + "sampling/importance_sampling_ratio/min": 0.6158265471458435, + "sampling/sampling_logp_difference/max": 0.4847898483276367, + "sampling/sampling_logp_difference/mean": 0.015268450602889061, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 168.71875, + "completions/mean_terminated_length": 168.71875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3771747946739197, + "epoch": 0.2034313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01791208693210655, + "kl": 0.004233954939991236, + "learning_rate": 6.707317073170731e-07, + "loss": 0.0, + "num_tokens": 5152300.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3672889471054077, + "sampling/importance_sampling_ratio/mean": 0.9996294975280762, + "sampling/importance_sampling_ratio/min": 0.6267502307891846, + "sampling/sampling_logp_difference/max": 0.4672071933746338, + "sampling/sampling_logp_difference/mean": 0.016697824001312256, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 203.796875, + "completions/mean_terminated_length": 203.796875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3788134455680847, + "epoch": 0.20465686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0157473432980807, + "kl": 0.004376332275569439, + "learning_rate": 6.747967479674797e-07, + "loss": 0.0, + "num_tokens": 5182671.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.585753321647644, + "sampling/importance_sampling_ratio/mean": 1.0001531839370728, + "sampling/importance_sampling_ratio/min": 0.6228065490722656, + "sampling/sampling_logp_difference/max": 0.47351932525634766, + "sampling/sampling_logp_difference/mean": 0.015365363098680973, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 217.5625, + "completions/mean_terminated_length": 217.5625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.46515071392059326, + "epoch": 0.20588235294117646, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2459167809280487, + "kl": 0.003541711252182722, + "learning_rate": 6.788617886178861e-07, + "loss": 0.0037, + "num_tokens": 5214211.0, + "reward": -0.03125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.435779333114624, + "sampling/importance_sampling_ratio/mean": 1.0002970695495605, + "sampling/importance_sampling_ratio/min": 0.3252015709877014, + "sampling/sampling_logp_difference/max": 1.1233100891113281, + "sampling/sampling_logp_difference/mean": 0.017395062372088432, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 244.296875, + "completions/mean_terminated_length": 244.296875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.35924825072288513, + "epoch": 0.20710784313725492, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012788952296011735, + "kl": 0.003738091792911291, + "learning_rate": 6.829268292682927e-07, + "loss": 0.0, + "num_tokens": 5247734.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5637272596359253, + "sampling/importance_sampling_ratio/mean": 1.0001357793807983, + "sampling/importance_sampling_ratio/min": 0.5928646326065063, + "sampling/sampling_logp_difference/max": 0.5227892398834229, + "sampling/sampling_logp_difference/mean": 0.015229095704853535, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 219.46875, + "completions/mean_terminated_length": 219.46875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3450343608856201, + "epoch": 0.20833333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9312880034856267, + "kl": 0.0034308377653360367, + "learning_rate": 6.869918699186991e-07, + "loss": -0.0048, + "num_tokens": 5282340.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4391900300979614, + "sampling/importance_sampling_ratio/mean": 0.999631404876709, + "sampling/importance_sampling_ratio/min": 0.546565592288971, + "sampling/sampling_logp_difference/max": 0.6041009426116943, + "sampling/sampling_logp_difference/mean": 0.016199318692088127, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 177.84375, + "completions/mean_terminated_length": 177.84375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.386122465133667, + "epoch": 0.20955882352941177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017727940835066208, + "kl": 0.003923412412405014, + "learning_rate": 6.910569105691057e-07, + "loss": 0.0, + "num_tokens": 5311642.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4683748483657837, + "sampling/importance_sampling_ratio/mean": 0.9991652369499207, + "sampling/importance_sampling_ratio/min": 0.6329178810119629, + "sampling/sampling_logp_difference/max": 0.4574146270751953, + "sampling/sampling_logp_difference/mean": 0.01795331947505474, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 234.734375, + "completions/mean_terminated_length": 234.734375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.33944404125213623, + "epoch": 0.2107843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7468689868955847, + "kl": 0.0033471607603132725, + "learning_rate": 6.951219512195121e-07, + "loss": -0.0085, + "num_tokens": 5348041.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.598752737045288, + "sampling/importance_sampling_ratio/mean": 0.9998255372047424, + "sampling/importance_sampling_ratio/min": 0.637968897819519, + "sampling/sampling_logp_difference/max": 0.4692237377166748, + "sampling/sampling_logp_difference/mean": 0.014106452465057373, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 218.921875, + "completions/mean_terminated_length": 218.921875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.30958008766174316, + "epoch": 0.21200980392156862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012583170300759833, + "kl": 0.003378768917173147, + "learning_rate": 6.991869918699187e-07, + "loss": 0.0, + "num_tokens": 5380228.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5745068788528442, + "sampling/importance_sampling_ratio/mean": 1.0001413822174072, + "sampling/importance_sampling_ratio/min": 0.6016868948936462, + "sampling/sampling_logp_difference/max": 0.5080180168151855, + "sampling/sampling_logp_difference/mean": 0.014329886995255947, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 158.9375, + "completions/mean_terminated_length": 158.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4220223128795624, + "epoch": 0.21323529411764705, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9888867576948446, + "kl": 0.004015155136585236, + "learning_rate": 7.032520325203252e-07, + "loss": -0.0017, + "num_tokens": 5409168.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6091922521591187, + "sampling/importance_sampling_ratio/mean": 0.9998781681060791, + "sampling/importance_sampling_ratio/min": 0.6208694577217102, + "sampling/sampling_logp_difference/max": 0.47663450241088867, + "sampling/sampling_logp_difference/mean": 0.019033968448638916, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 239.890625, + "completions/mean_terminated_length": 239.890625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.32471412420272827, + "epoch": 0.21446078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010540768455384751, + "kl": 0.002627901965752244, + "learning_rate": 7.073170731707316e-07, + "loss": 0.0, + "num_tokens": 5445769.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5332598686218262, + "sampling/importance_sampling_ratio/mean": 1.000173568725586, + "sampling/importance_sampling_ratio/min": 0.675683856010437, + "sampling/sampling_logp_difference/max": 0.4273960590362549, + "sampling/sampling_logp_difference/mean": 0.014171200804412365, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 196.9375, + "completions/mean_terminated_length": 196.9375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4236835539340973, + "epoch": 0.21568627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013264606020078321, + "kl": 0.00399052444845438, + "learning_rate": 7.113821138211382e-07, + "loss": 0.0, + "num_tokens": 5480949.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4288609027862549, + "sampling/importance_sampling_ratio/mean": 0.9993376731872559, + "sampling/importance_sampling_ratio/min": 0.6968380808830261, + "sampling/sampling_logp_difference/max": 0.3612022399902344, + "sampling/sampling_logp_difference/mean": 0.01731596142053604, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 216.5, + "completions/mean_terminated_length": 216.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3174845576286316, + "epoch": 0.21691176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012555111751764091, + "kl": 0.0028034732677042484, + "learning_rate": 7.154471544715447e-07, + "loss": 0.0, + "num_tokens": 5517717.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6263964176177979, + "sampling/importance_sampling_ratio/mean": 1.0002188682556152, + "sampling/importance_sampling_ratio/min": 0.6396260857582092, + "sampling/sampling_logp_difference/max": 0.48636674880981445, + "sampling/sampling_logp_difference/mean": 0.014294363558292389, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 204.046875, + "completions/mean_terminated_length": 204.046875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2718321681022644, + "epoch": 0.2181372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011483939277446507, + "kl": 0.00246023153886199, + "learning_rate": 7.195121951219512e-07, + "loss": 0.0, + "num_tokens": 5551096.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3715288639068604, + "sampling/importance_sampling_ratio/mean": 0.9999605417251587, + "sampling/importance_sampling_ratio/min": 0.6878846883773804, + "sampling/sampling_logp_difference/max": 0.3741340637207031, + "sampling/sampling_logp_difference/mean": 0.013501793146133423, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 195.578125, + "completions/mean_terminated_length": 195.578125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3303038775920868, + "epoch": 0.2193627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0069651822560624, + "kl": 0.0032733306288719177, + "learning_rate": 7.235772357723577e-07, + "loss": 0.0362, + "num_tokens": 5588973.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4814108610153198, + "sampling/importance_sampling_ratio/mean": 0.9997945427894592, + "sampling/importance_sampling_ratio/min": 0.4739038646221161, + "sampling/sampling_logp_difference/max": 0.7467508316040039, + "sampling/sampling_logp_difference/mean": 0.014692796394228935, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.41069531440734863, + "epoch": 0.22058823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3343516607581438, + "kl": 0.0040096924640238285, + "learning_rate": 7.276422764227642e-07, + "loss": -0.0364, + "num_tokens": 5619933.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4864038228988647, + "sampling/importance_sampling_ratio/mean": 0.9999009370803833, + "sampling/importance_sampling_ratio/min": 0.6505169868469238, + "sampling/sampling_logp_difference/max": 0.42998790740966797, + "sampling/sampling_logp_difference/mean": 0.018758460879325867, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 203.5625, + "completions/mean_terminated_length": 203.5625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3594907224178314, + "epoch": 0.22181372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.310196378375213, + "kl": 0.00374414399266243, + "learning_rate": 7.317073170731707e-07, + "loss": -0.0079, + "num_tokens": 5648913.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5919442176818848, + "sampling/importance_sampling_ratio/mean": 1.0004675388336182, + "sampling/importance_sampling_ratio/min": 0.6224140524864197, + "sampling/sampling_logp_difference/max": 0.4741497039794922, + "sampling/sampling_logp_difference/mean": 0.01597871072590351, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 193.1875, + "completions/mean_terminated_length": 193.1875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3262665271759033, + "epoch": 0.22303921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017245806061220382, + "kl": 0.0042599341832101345, + "learning_rate": 7.357723577235772e-07, + "loss": 0.0, + "num_tokens": 5683837.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.506589412689209, + "sampling/importance_sampling_ratio/mean": 0.9992817640304565, + "sampling/importance_sampling_ratio/min": 0.6075704097747803, + "sampling/sampling_logp_difference/max": 0.4982872009277344, + "sampling/sampling_logp_difference/mean": 0.01512528583407402, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 181.078125, + "completions/mean_terminated_length": 181.078125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.32310307025909424, + "epoch": 0.22426470588235295, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012105599319543987, + "kl": 0.003585977014154196, + "learning_rate": 7.398373983739837e-07, + "loss": 0.0, + "num_tokens": 5715874.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4722709655761719, + "sampling/importance_sampling_ratio/mean": 1.0006076097488403, + "sampling/importance_sampling_ratio/min": 0.6348845958709717, + "sampling/sampling_logp_difference/max": 0.4543120861053467, + "sampling/sampling_logp_difference/mean": 0.01581915281713009, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 193.015625, + "completions/mean_terminated_length": 193.015625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.40230071544647217, + "epoch": 0.22549019607843138, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1221218071458032, + "kl": 0.004614609759300947, + "learning_rate": 7.439024390243903e-07, + "loss": 0.0372, + "num_tokens": 5743107.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 0.9996156692504883, + "sampling/importance_sampling_ratio/min": 0.6805951595306396, + "sampling/sampling_logp_difference/max": 0.4682176113128662, + "sampling/sampling_logp_difference/mean": 0.01691802218556404, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 181.578125, + "completions/mean_terminated_length": 181.578125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.46721404790878296, + "epoch": 0.2267156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4089179931957916, + "kl": 0.006852032616734505, + "learning_rate": 7.479674796747967e-07, + "loss": 0.0262, + "num_tokens": 5772184.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6012070178985596, + "sampling/importance_sampling_ratio/mean": 0.999984860420227, + "sampling/importance_sampling_ratio/min": 0.6403137445449829, + "sampling/sampling_logp_difference/max": 0.47075772285461426, + "sampling/sampling_logp_difference/mean": 0.01858537644147873, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 180.03125, + "completions/mean_terminated_length": 180.03125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3536166846752167, + "epoch": 0.22794117647058823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013904942323311334, + "kl": 0.004634760785847902, + "learning_rate": 7.520325203252032e-07, + "loss": 0.0, + "num_tokens": 5804314.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5369963645935059, + "sampling/importance_sampling_ratio/mean": 1.0005981922149658, + "sampling/importance_sampling_ratio/min": 0.6708944439888, + "sampling/sampling_logp_difference/max": 0.42983007431030273, + "sampling/sampling_logp_difference/mean": 0.015495835803449154, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 213.890625, + "completions/mean_terminated_length": 213.890625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.44987189769744873, + "epoch": 0.22916666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9168057676492665, + "kl": 0.005267436150461435, + "learning_rate": 7.560975609756097e-07, + "loss": 0.0137, + "num_tokens": 5836579.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3781383037567139, + "sampling/importance_sampling_ratio/mean": 0.9997191429138184, + "sampling/importance_sampling_ratio/min": 0.5998452305793762, + "sampling/sampling_logp_difference/max": 0.5110836029052734, + "sampling/sampling_logp_difference/mean": 0.017986461520195007, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 218.234375, + "completions/mean_terminated_length": 218.234375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4195953607559204, + "epoch": 0.23039215686274508, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0181818507958051, + "kl": 0.004643009044229984, + "learning_rate": 7.601626016260162e-07, + "loss": -0.0348, + "num_tokens": 5877618.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002375841140747, + "sampling/importance_sampling_ratio/min": 0.6030285954475403, + "sampling/sampling_logp_difference/max": 1.470839500427246, + "sampling/sampling_logp_difference/mean": 0.017538927495479584, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 186.34375, + "completions/mean_terminated_length": 186.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.38808345794677734, + "epoch": 0.23161764705882354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01810426589484519, + "kl": 0.005774513818323612, + "learning_rate": 7.642276422764228e-07, + "loss": 0.0001, + "num_tokens": 5905400.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4132622480392456, + "sampling/importance_sampling_ratio/mean": 0.9998592734336853, + "sampling/importance_sampling_ratio/min": 0.6387045979499817, + "sampling/sampling_logp_difference/max": 0.44831323623657227, + "sampling/sampling_logp_difference/mean": 0.016829874366521835, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 129.3125, + "completions/mean_terminated_length": 129.3125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3367841839790344, + "epoch": 0.23284313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4615088240305407, + "kl": 0.009488044306635857, + "learning_rate": 7.682926829268292e-07, + "loss": -0.0405, + "num_tokens": 5928972.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4128563404083252, + "sampling/importance_sampling_ratio/mean": 0.9995773434638977, + "sampling/importance_sampling_ratio/min": 0.5278366208076477, + "sampling/sampling_logp_difference/max": 0.6389684677124023, + "sampling/sampling_logp_difference/mean": 0.017004843801259995, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 212.15625, + "completions/mean_terminated_length": 212.15625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.40614062547683716, + "epoch": 0.2340686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019031274405640942, + "kl": 0.00624141376465559, + "learning_rate": 7.723577235772358e-07, + "loss": 0.0001, + "num_tokens": 5964614.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7942405939102173, + "sampling/importance_sampling_ratio/mean": 0.9996316432952881, + "sampling/importance_sampling_ratio/min": 0.6298720836639404, + "sampling/sampling_logp_difference/max": 0.5845818519592285, + "sampling/sampling_logp_difference/mean": 0.017022619023919106, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 224.171875, + "completions/mean_terminated_length": 224.171875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.37709319591522217, + "epoch": 0.23529411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8466587922316804, + "kl": 0.006414324976503849, + "learning_rate": 7.764227642276422e-07, + "loss": -0.0201, + "num_tokens": 5994145.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5532957315444946, + "sampling/importance_sampling_ratio/mean": 0.9999184012413025, + "sampling/importance_sampling_ratio/min": 0.6281425356864929, + "sampling/sampling_logp_difference/max": 0.46498823165893555, + "sampling/sampling_logp_difference/mean": 0.015568745322525501, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 289.28125, + "completions/mean_terminated_length": 289.28125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3816249668598175, + "epoch": 0.23651960784313725, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7064395936203522, + "kl": 0.0034015390556305647, + "learning_rate": 7.804878048780488e-07, + "loss": 0.0145, + "num_tokens": 6037539.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.498374342918396, + "sampling/importance_sampling_ratio/mean": 0.9999957084655762, + "sampling/importance_sampling_ratio/min": 0.6288349032402039, + "sampling/sampling_logp_difference/max": 0.4638864994049072, + "sampling/sampling_logp_difference/mean": 0.01520681381225586, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 241.546875, + "completions/mean_terminated_length": 241.546875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3961125612258911, + "epoch": 0.23774509803921567, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.782659026683507, + "kl": 0.006137443706393242, + "learning_rate": 7.845528455284552e-07, + "loss": -0.0032, + "num_tokens": 6069014.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.484194040298462, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.5979025363922119, + "sampling/sampling_logp_difference/max": 0.5143275260925293, + "sampling/sampling_logp_difference/mean": 0.01631583273410797, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 251.78125, + "completions/mean_terminated_length": 251.78125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.37602466344833374, + "epoch": 0.23897058823529413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014806989123684491, + "kl": 0.0050431289710104465, + "learning_rate": 7.886178861788617e-07, + "loss": 0.0, + "num_tokens": 6104072.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7715590000152588, + "sampling/importance_sampling_ratio/mean": 1.0002853870391846, + "sampling/importance_sampling_ratio/min": 0.6202758550643921, + "sampling/sampling_logp_difference/max": 0.5718599557876587, + "sampling/sampling_logp_difference/mean": 0.015946928411722183, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 181.453125, + "completions/mean_terminated_length": 181.453125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.31748703122138977, + "epoch": 0.24019607843137256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022062487150172987, + "kl": 0.006586791016161442, + "learning_rate": 7.926829268292683e-07, + "loss": 0.0001, + "num_tokens": 6132069.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5275835990905762, + "sampling/importance_sampling_ratio/mean": 0.9999291300773621, + "sampling/importance_sampling_ratio/min": 0.6065285205841064, + "sampling/sampling_logp_difference/max": 0.5000035762786865, + "sampling/sampling_logp_difference/mean": 0.015199665911495686, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 198.5625, + "completions/mean_terminated_length": 198.5625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3671415448188782, + "epoch": 0.24142156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01974363161673683, + "kl": 0.006311338860541582, + "learning_rate": 7.967479674796747e-07, + "loss": 0.0001, + "num_tokens": 6164473.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.367321252822876, + "sampling/importance_sampling_ratio/mean": 1.000627040863037, + "sampling/importance_sampling_ratio/min": 0.6262636184692383, + "sampling/sampling_logp_difference/max": 0.4679839611053467, + "sampling/sampling_logp_difference/mean": 0.014926253817975521, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 184.546875, + "completions/mean_terminated_length": 184.546875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.35503798723220825, + "epoch": 0.2426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9855354953030266, + "kl": 0.0070960987359285355, + "learning_rate": 8.008130081300813e-07, + "loss": 0.0046, + "num_tokens": 6189756.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.8739265203475952, + "sampling/importance_sampling_ratio/mean": 1.000748634338379, + "sampling/importance_sampling_ratio/min": 0.6033843755722046, + "sampling/sampling_logp_difference/max": 0.6280360221862793, + "sampling/sampling_logp_difference/mean": 0.01589926704764366, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 214.265625, + "completions/mean_terminated_length": 214.265625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.37239372730255127, + "epoch": 0.24387254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0217863839072463, + "kl": 0.006487885490059853, + "learning_rate": 8.048780487804878e-07, + "loss": -0.0282, + "num_tokens": 6222333.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.320409893989563, + "sampling/importance_sampling_ratio/mean": 0.9995848536491394, + "sampling/importance_sampling_ratio/min": 0.654735803604126, + "sampling/sampling_logp_difference/max": 0.4235234260559082, + "sampling/sampling_logp_difference/mean": 0.014400139451026917, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 226.921875, + "completions/mean_terminated_length": 226.921875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.34588587284088135, + "epoch": 0.24509803921568626, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015066589558551828, + "kl": 0.00500678364187479, + "learning_rate": 8.089430894308943e-07, + "loss": 0.0, + "num_tokens": 6262344.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9079806804656982, + "sampling/importance_sampling_ratio/mean": 1.000096321105957, + "sampling/importance_sampling_ratio/min": 0.4871023893356323, + "sampling/sampling_logp_difference/max": 0.7192809581756592, + "sampling/sampling_logp_difference/mean": 0.014446980319917202, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 200.984375, + "completions/mean_terminated_length": 200.984375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.4419822096824646, + "epoch": 0.24632352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017528080005748545, + "kl": 0.005993842612951994, + "learning_rate": 8.130081300813008e-07, + "loss": 0.0001, + "num_tokens": 6292359.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4518470764160156, + "sampling/importance_sampling_ratio/mean": 1.000361442565918, + "sampling/importance_sampling_ratio/min": 0.5999310612678528, + "sampling/sampling_logp_difference/max": 0.5109405517578125, + "sampling/sampling_logp_difference/mean": 0.017505858093500137, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 227.375, + "completions/mean_terminated_length": 227.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3625558614730835, + "epoch": 0.24754901960784315, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4113681466450303, + "kl": 0.0048790015280246735, + "learning_rate": 8.170731707317072e-07, + "loss": -0.0063, + "num_tokens": 6326223.0, + "reward": 0.71875, + "reward_std": 0.5722135901451111, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.326668381690979, + "sampling/importance_sampling_ratio/mean": 1.000248670578003, + "sampling/importance_sampling_ratio/min": 0.60129714012146, + "sampling/sampling_logp_difference/max": 0.5086660385131836, + "sampling/sampling_logp_difference/mean": 0.014618618413805962, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 132.265625, + "completions/mean_terminated_length": 132.265625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.33129891753196716, + "epoch": 0.24877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02401984277029526, + "kl": 0.0077463616617023945, + "learning_rate": 8.211382113821138e-07, + "loss": 0.0001, + "num_tokens": 6349680.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6219075918197632, + "sampling/importance_sampling_ratio/mean": 0.999733567237854, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.48360300064086914, + "sampling/sampling_logp_difference/mean": 0.015228422358632088, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 161.734375, + "completions/mean_terminated_length": 161.734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3452780246734619, + "epoch": 0.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018051280744605103, + "kl": 0.005515796132385731, + "learning_rate": 8.252032520325202e-07, + "loss": 0.0001, + "num_tokens": 6381487.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4702109098434448, + "sampling/importance_sampling_ratio/mean": 1.0000395774841309, + "sampling/importance_sampling_ratio/min": 0.5659708380699158, + "sampling/sampling_logp_difference/max": 0.5692126750946045, + "sampling/sampling_logp_difference/mean": 0.01679180935025215, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 205.28125, + "completions/mean_terminated_length": 205.28125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3766824007034302, + "epoch": 0.2512254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7310462494103193, + "kl": 0.006803940050303936, + "learning_rate": 8.292682926829268e-07, + "loss": 0.0159, + "num_tokens": 6410017.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6630347967147827, + "sampling/importance_sampling_ratio/mean": 1.000176191329956, + "sampling/importance_sampling_ratio/min": 0.6379572153091431, + "sampling/sampling_logp_difference/max": 0.5086441040039062, + "sampling/sampling_logp_difference/mean": 0.014964250847697258, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 183.375, + "completions/mean_terminated_length": 183.375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.3047809898853302, + "epoch": 0.25245098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018410139615306992, + "kl": 0.006978918798267841, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0001, + "num_tokens": 6441881.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.57449209690094, + "sampling/importance_sampling_ratio/mean": 0.9996367692947388, + "sampling/importance_sampling_ratio/min": 0.6156854033470154, + "sampling/sampling_logp_difference/max": 0.4850192070007324, + "sampling/sampling_logp_difference/mean": 0.014237132854759693, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 244.40625, + "completions/mean_terminated_length": 244.40625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4763853847980499, + "epoch": 0.2536764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.742727042758716, + "kl": 0.0060688345693051815, + "learning_rate": 8.373983739837398e-07, + "loss": -0.0075, + "num_tokens": 6475699.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6098331212997437, + "sampling/importance_sampling_ratio/mean": 1.000468373298645, + "sampling/importance_sampling_ratio/min": 0.6110594272613525, + "sampling/sampling_logp_difference/max": 0.49256110191345215, + "sampling/sampling_logp_difference/mean": 0.017908930778503418, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 195.4375, + "completions/mean_terminated_length": 195.4375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4074808657169342, + "epoch": 0.2549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9959708854852373, + "kl": 0.00976475328207016, + "learning_rate": 8.414634146341463e-07, + "loss": -0.0133, + "num_tokens": 6502783.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5459479093551636, + "sampling/importance_sampling_ratio/mean": 1.0006380081176758, + "sampling/importance_sampling_ratio/min": 0.62617027759552, + "sampling/sampling_logp_difference/max": 0.46813297271728516, + "sampling/sampling_logp_difference/mean": 0.016537927091121674, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 157.953125, + "completions/mean_terminated_length": 157.953125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.4467363953590393, + "epoch": 0.25612745098039214, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1726912773808704, + "kl": 0.009501153603196144, + "learning_rate": 8.455284552845529e-07, + "loss": 0.0232, + "num_tokens": 6527932.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4439783096313477, + "sampling/importance_sampling_ratio/mean": 1.0001927614212036, + "sampling/importance_sampling_ratio/min": 0.6232355237007141, + "sampling/sampling_logp_difference/max": 0.47283077239990234, + "sampling/sampling_logp_difference/mean": 0.019368894398212433, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 179.359375, + "completions/mean_terminated_length": 179.359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3574153482913971, + "epoch": 0.25735294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1452082427598702, + "kl": 0.007702191825956106, + "learning_rate": 8.495934959349593e-07, + "loss": -0.0059, + "num_tokens": 6557443.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.549471139907837, + "sampling/importance_sampling_ratio/mean": 1.0003321170806885, + "sampling/importance_sampling_ratio/min": 0.7380995154380798, + "sampling/sampling_logp_difference/max": 0.4379136562347412, + "sampling/sampling_logp_difference/mean": 0.014593801461160183, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 195.640625, + "completions/mean_terminated_length": 195.640625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4797608256340027, + "epoch": 0.25857843137254904, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.786825032092835, + "kl": 0.009710676968097687, + "learning_rate": 8.536585365853657e-07, + "loss": -0.0074, + "num_tokens": 6587948.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 0.9994493722915649, + "sampling/importance_sampling_ratio/min": 0.6035121083259583, + "sampling/sampling_logp_difference/max": 0.5049891471862793, + "sampling/sampling_logp_difference/mean": 0.018817134201526642, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 180.5, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4090731739997864, + "epoch": 0.25980392156862747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02247531717029179, + "kl": 0.008128427900373936, + "learning_rate": 8.577235772357723e-07, + "loss": 0.0001, + "num_tokens": 6613948.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4769021272659302, + "sampling/importance_sampling_ratio/mean": 0.9996882677078247, + "sampling/importance_sampling_ratio/min": 0.6154875159263611, + "sampling/sampling_logp_difference/max": 0.48534059524536133, + "sampling/sampling_logp_difference/mean": 0.01656338945031166, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 226.0, + "completions/mean_terminated_length": 226.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4429599642753601, + "epoch": 0.2610294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.875112258366726, + "kl": 0.007630010601133108, + "learning_rate": 8.617886178861788e-07, + "loss": -0.0137, + "num_tokens": 6650524.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7480331659317017, + "sampling/importance_sampling_ratio/mean": 1.0001615285873413, + "sampling/importance_sampling_ratio/min": 0.6175042986869812, + "sampling/sampling_logp_difference/max": 0.5584912300109863, + "sampling/sampling_logp_difference/mean": 0.01645074039697647, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 244.96875, + "completions/mean_terminated_length": 244.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.45601630210876465, + "epoch": 0.2622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01924254280240346, + "kl": 0.00718282163143158, + "learning_rate": 8.658536585365853e-07, + "loss": 0.0001, + "num_tokens": 6690858.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.699453592300415, + "sampling/importance_sampling_ratio/mean": 0.9991973638534546, + "sampling/importance_sampling_ratio/min": 0.6268161535263062, + "sampling/sampling_logp_difference/max": 0.5303068161010742, + "sampling/sampling_logp_difference/mean": 0.016671188175678253, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 182.21875, + "completions/mean_terminated_length": 182.21875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.41794514656066895, + "epoch": 0.26348039215686275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025044838988155363, + "kl": 0.009658871218562126, + "learning_rate": 8.699186991869918e-07, + "loss": 0.0001, + "num_tokens": 6730216.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.873031497001648, + "sampling/importance_sampling_ratio/mean": 0.9994128942489624, + "sampling/importance_sampling_ratio/min": 0.6774318814277649, + "sampling/sampling_logp_difference/max": 0.6275582313537598, + "sampling/sampling_logp_difference/mean": 0.01576501503586769, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 246.5, + "completions/mean_terminated_length": 246.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3788728713989258, + "epoch": 0.2647058823529412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021093975561889924, + "kl": 0.009017504751682281, + "learning_rate": 8.739837398373984e-07, + "loss": 0.0001, + "num_tokens": 6764600.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4169543981552124, + "sampling/importance_sampling_ratio/mean": 1.000279188156128, + "sampling/importance_sampling_ratio/min": 0.5626490712165833, + "sampling/sampling_logp_difference/max": 0.5750991106033325, + "sampling/sampling_logp_difference/mean": 0.014173740521073341, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 204.015625, + "completions/mean_terminated_length": 204.015625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.389233261346817, + "epoch": 0.2659313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024991982444765946, + "kl": 0.011074772104620934, + "learning_rate": 8.780487804878048e-07, + "loss": 0.0001, + "num_tokens": 6796729.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.401310920715332, + "sampling/importance_sampling_ratio/mean": 1.0005078315734863, + "sampling/importance_sampling_ratio/min": 0.677149772644043, + "sampling/sampling_logp_difference/max": 0.3898627758026123, + "sampling/sampling_logp_difference/mean": 0.014612487517297268, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 219.15625, + "completions/mean_terminated_length": 219.15625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3317946195602417, + "epoch": 0.26715686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021233170321532966, + "kl": 0.008910744450986385, + "learning_rate": 8.821138211382113e-07, + "loss": 0.0001, + "num_tokens": 6831971.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.327485203742981, + "sampling/importance_sampling_ratio/mean": 0.9994485378265381, + "sampling/importance_sampling_ratio/min": 0.6925040483474731, + "sampling/sampling_logp_difference/max": 0.36744117736816406, + "sampling/sampling_logp_difference/mean": 0.01264607347548008, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 196.6875, + "completions/mean_terminated_length": 196.6875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4074782133102417, + "epoch": 0.26838235294117646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035860313956660055, + "kl": 0.013461483642458916, + "learning_rate": 8.861788617886179e-07, + "loss": 0.0001, + "num_tokens": 6861071.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4998352527618408, + "sampling/importance_sampling_ratio/mean": 0.9996261596679688, + "sampling/importance_sampling_ratio/min": 0.614910364151001, + "sampling/sampling_logp_difference/max": 0.486278772354126, + "sampling/sampling_logp_difference/mean": 0.01648368313908577, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 210.859375, + "completions/mean_terminated_length": 210.859375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4149256944656372, + "epoch": 0.2696078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2233127015137235, + "kl": 0.011519803665578365, + "learning_rate": 8.902439024390244e-07, + "loss": -0.0113, + "num_tokens": 6892294.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4479440450668335, + "sampling/importance_sampling_ratio/mean": 1.0004557371139526, + "sampling/importance_sampling_ratio/min": 0.6886289715766907, + "sampling/sampling_logp_difference/max": 0.37305259704589844, + "sampling/sampling_logp_difference/mean": 0.01560588926076889, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 202.28125, + "completions/mean_terminated_length": 202.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.5796515941619873, + "epoch": 0.2708333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9406076674499578, + "kl": 0.012280515395104885, + "learning_rate": 8.943089430894308e-07, + "loss": 0.0045, + "num_tokens": 6920904.0, + "reward": -0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.3890950679779053, + "sampling/importance_sampling_ratio/mean": 0.9999018907546997, + "sampling/importance_sampling_ratio/min": 0.6918495297431946, + "sampling/sampling_logp_difference/max": 0.36838674545288086, + "sampling/sampling_logp_difference/mean": 0.018281951546669006, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 177.34375, + "completions/mean_terminated_length": 177.34375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.38370460271835327, + "epoch": 0.27205882352941174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02551354122962791, + "kl": 0.011651578359305859, + "learning_rate": 8.983739837398373e-07, + "loss": 0.0001, + "num_tokens": 6952062.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6262208223342896, + "sampling/importance_sampling_ratio/mean": 1.0000596046447754, + "sampling/importance_sampling_ratio/min": 0.6893720030784607, + "sampling/sampling_logp_difference/max": 0.48625874519348145, + "sampling/sampling_logp_difference/mean": 0.014482217840850353, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 227.609375, + "completions/mean_terminated_length": 227.609375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.40940508246421814, + "epoch": 0.27328431372549017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02309105560794474, + "kl": 0.00955723226070404, + "learning_rate": 9.024390243902439e-07, + "loss": 0.0001, + "num_tokens": 6985877.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4982682466506958, + "sampling/importance_sampling_ratio/mean": 0.999754011631012, + "sampling/importance_sampling_ratio/min": 0.6371425986289978, + "sampling/sampling_logp_difference/max": 0.4507617950439453, + "sampling/sampling_logp_difference/mean": 0.0144026018679142, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 205.328125, + "completions/mean_terminated_length": 205.328125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.4640798568725586, + "epoch": 0.27450980392156865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025321638175525355, + "kl": 0.010854870080947876, + "learning_rate": 9.065040650406503e-07, + "loss": 0.0001, + "num_tokens": 7018010.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5766310691833496, + "sampling/importance_sampling_ratio/mean": 0.9997616410255432, + "sampling/importance_sampling_ratio/min": 0.6246612668037415, + "sampling/sampling_logp_difference/max": 0.47054576873779297, + "sampling/sampling_logp_difference/mean": 0.01709725335240364, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 216.078125, + "completions/mean_terminated_length": 216.078125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3811000883579254, + "epoch": 0.2757352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025339046862078777, + "kl": 0.011608104221522808, + "learning_rate": 9.105691056910569e-07, + "loss": 0.0001, + "num_tokens": 7047103.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4439653158187866, + "sampling/importance_sampling_ratio/mean": 0.9998691082000732, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.4335259199142456, + "sampling/sampling_logp_difference/mean": 0.014265717938542366, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 205.359375, + "completions/mean_terminated_length": 205.359375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4888272285461426, + "epoch": 0.2769607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9265821285741606, + "kl": 0.013852481730282307, + "learning_rate": 9.146341463414634e-07, + "loss": 0.0056, + "num_tokens": 7076422.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.3700966835021973, + "sampling/importance_sampling_ratio/mean": 0.9996808767318726, + "sampling/importance_sampling_ratio/min": 0.7715784907341003, + "sampling/sampling_logp_difference/max": 0.3148813247680664, + "sampling/sampling_logp_difference/mean": 0.01662006415426731, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 180.9375, + "completions/mean_terminated_length": 180.9375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3419240117073059, + "epoch": 0.27818627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03017489332894213, + "kl": 0.014357686042785645, + "learning_rate": 9.186991869918699e-07, + "loss": 0.0001, + "num_tokens": 7105426.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5626256465911865, + "sampling/importance_sampling_ratio/mean": 0.999943733215332, + "sampling/importance_sampling_ratio/min": 0.6393700242042542, + "sampling/sampling_logp_difference/max": 0.4472719430923462, + "sampling/sampling_logp_difference/mean": 0.014091861434280872, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 238.265625, + "completions/mean_terminated_length": 238.265625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.38244467973709106, + "epoch": 0.27941176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021481967245729417, + "kl": 0.009679099544882774, + "learning_rate": 9.227642276422763e-07, + "loss": 0.0001, + "num_tokens": 7142787.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4060695171356201, + "sampling/importance_sampling_ratio/mean": 1.0001033544540405, + "sampling/importance_sampling_ratio/min": 0.7145366072654724, + "sampling/sampling_logp_difference/max": 0.3407982587814331, + "sampling/sampling_logp_difference/mean": 0.013826992362737656, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 206.09375, + "completions/mean_terminated_length": 206.09375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.41250723600387573, + "epoch": 0.2806372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6999583230241203, + "kl": 0.011442895978689194, + "learning_rate": 9.26829268292683e-07, + "loss": 0.003, + "num_tokens": 7175609.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4759176969528198, + "sampling/importance_sampling_ratio/mean": 0.9998277425765991, + "sampling/importance_sampling_ratio/min": 0.6153390407562256, + "sampling/sampling_logp_difference/max": 0.4855818748474121, + "sampling/sampling_logp_difference/mean": 0.01563001424074173, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 215.171875, + "completions/mean_terminated_length": 215.171875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.47626060247421265, + "epoch": 0.2818627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02398809377788757, + "kl": 0.0111317690461874, + "learning_rate": 9.308943089430894e-07, + "loss": 0.0001, + "num_tokens": 7219956.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6365751028060913, + "sampling/importance_sampling_ratio/mean": 1.0000046491622925, + "sampling/importance_sampling_ratio/min": 0.6299163699150085, + "sampling/sampling_logp_difference/max": 0.49260568618774414, + "sampling/sampling_logp_difference/mean": 0.017304297536611557, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 221.953125, + "completions/mean_terminated_length": 221.953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3158389925956726, + "epoch": 0.28308823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016257277204131226, + "kl": 0.006831689737737179, + "learning_rate": 9.349593495934958e-07, + "loss": 0.0001, + "num_tokens": 7251873.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997228384017944, + "sampling/importance_sampling_ratio/min": 0.6526646614074707, + "sampling/sampling_logp_difference/max": 0.7052080631256104, + "sampling/sampling_logp_difference/mean": 0.011903062462806702, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 178.953125, + "completions/mean_terminated_length": 178.953125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4166858196258545, + "epoch": 0.28431372549019607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024739217097453914, + "kl": 0.010809991508722305, + "learning_rate": 9.390243902439024e-07, + "loss": 0.0001, + "num_tokens": 7283470.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.39656400680542, + "sampling/importance_sampling_ratio/mean": 1.0000054836273193, + "sampling/importance_sampling_ratio/min": 0.4615365266799927, + "sampling/sampling_logp_difference/max": 0.7731940746307373, + "sampling/sampling_logp_difference/mean": 0.01575036346912384, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 207.078125, + "completions/mean_terminated_length": 207.078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.42489081621170044, + "epoch": 0.2855392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8525932508928461, + "kl": 0.011268509551882744, + "learning_rate": 9.430894308943089e-07, + "loss": -0.0118, + "num_tokens": 7311683.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3816370964050293, + "sampling/importance_sampling_ratio/mean": 1.0003949403762817, + "sampling/importance_sampling_ratio/min": 0.6659024357795715, + "sampling/sampling_logp_difference/max": 0.4066121578216553, + "sampling/sampling_logp_difference/mean": 0.016137288883328438, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 281.765625, + "completions/mean_terminated_length": 281.765625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.44613972306251526, + "epoch": 0.2867647058823529, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8742479747553392, + "kl": 0.0076088979840278625, + "learning_rate": 9.471544715447154e-07, + "loss": -0.0017, + "num_tokens": 7352052.0, + "reward": 0.5625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9099587202072144, + "sampling/importance_sampling_ratio/mean": 1.000007152557373, + "sampling/importance_sampling_ratio/min": 0.4805276095867157, + "sampling/sampling_logp_difference/max": 0.7328705787658691, + "sampling/sampling_logp_difference/mean": 0.014782894402742386, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 183.6875, + "completions/mean_terminated_length": 183.6875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3496655821800232, + "epoch": 0.28799019607843135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02338081316659742, + "kl": 0.00958043709397316, + "learning_rate": 9.512195121951218e-07, + "loss": 0.0001, + "num_tokens": 7383184.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.494954228401184, + "sampling/importance_sampling_ratio/mean": 1.0002262592315674, + "sampling/importance_sampling_ratio/min": 0.695496678352356, + "sampling/sampling_logp_difference/max": 0.4020955562591553, + "sampling/sampling_logp_difference/mean": 0.014258678071200848, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 159.953125, + "completions/mean_terminated_length": 159.953125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.45076289772987366, + "epoch": 0.28921568627450983, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6534245681919217, + "kl": 0.01330963708460331, + "learning_rate": 9.552845528455285e-07, + "loss": 0.047, + "num_tokens": 7407949.0, + "reward": 0.1875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6504887342453003, + "sampling/importance_sampling_ratio/mean": 1.0006005764007568, + "sampling/importance_sampling_ratio/min": 0.6347039937973022, + "sampling/sampling_logp_difference/max": 0.5010714530944824, + "sampling/sampling_logp_difference/mean": 0.019480448216199875, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 202.34375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4706074893474579, + "epoch": 0.29044117647058826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02233542927478274, + "kl": 0.009093056432902813, + "learning_rate": 9.59349593495935e-07, + "loss": 0.0001, + "num_tokens": 7455491.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3603936433792114, + "sampling/importance_sampling_ratio/mean": 1.000697135925293, + "sampling/importance_sampling_ratio/min": 0.6341907978057861, + "sampling/sampling_logp_difference/max": 0.45540547370910645, + "sampling/sampling_logp_difference/mean": 0.017054909840226173, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 184.8125, + "completions/mean_terminated_length": 184.8125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.40207499265670776, + "epoch": 0.2916666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0504165522851772, + "kl": 0.009620942175388336, + "learning_rate": 9.634146341463414e-07, + "loss": 0.013, + "num_tokens": 7481543.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5662195682525635, + "sampling/importance_sampling_ratio/mean": 1.0004643201828003, + "sampling/importance_sampling_ratio/min": 0.6290379762649536, + "sampling/sampling_logp_difference/max": 0.4635636806488037, + "sampling/sampling_logp_difference/mean": 0.01674029976129532, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 194.296875, + "completions/mean_terminated_length": 194.296875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.37485432624816895, + "epoch": 0.2928921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021978196293519697, + "kl": 0.008883722126483917, + "learning_rate": 9.67479674796748e-07, + "loss": 0.0001, + "num_tokens": 7514042.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.576985239982605, + "sampling/importance_sampling_ratio/mean": 1.000411033630371, + "sampling/importance_sampling_ratio/min": 0.657006025314331, + "sampling/sampling_logp_difference/max": 0.45551490783691406, + "sampling/sampling_logp_difference/mean": 0.016208358108997345, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 217.609375, + "completions/mean_terminated_length": 217.609375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3677450716495514, + "epoch": 0.29411764705882354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01726305153614504, + "kl": 0.006754512898623943, + "learning_rate": 9.715447154471544e-07, + "loss": 0.0001, + "num_tokens": 7545217.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.530495047569275, + "sampling/importance_sampling_ratio/mean": 0.9997333884239197, + "sampling/importance_sampling_ratio/min": 0.541140615940094, + "sampling/sampling_logp_difference/max": 0.6140761375427246, + "sampling/sampling_logp_difference/mean": 0.014982339926064014, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 187.515625, + "completions/mean_terminated_length": 187.515625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3908500075340271, + "epoch": 0.29534313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018932848100693202, + "kl": 0.00762729998677969, + "learning_rate": 9.756097560975609e-07, + "loss": 0.0001, + "num_tokens": 7573250.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.508407711982727, + "sampling/importance_sampling_ratio/mean": 0.9994632005691528, + "sampling/importance_sampling_ratio/min": 0.630445122718811, + "sampling/sampling_logp_difference/max": 0.46132922172546387, + "sampling/sampling_logp_difference/mean": 0.01583741419017315, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 216.390625, + "completions/mean_terminated_length": 216.390625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.42570817470550537, + "epoch": 0.2965686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9179455485735144, + "kl": 0.007978806272149086, + "learning_rate": 9.796747967479673e-07, + "loss": 0.0111, + "num_tokens": 7612923.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998685121536255, + "sampling/importance_sampling_ratio/min": 0.6407722234725952, + "sampling/sampling_logp_difference/max": 0.8378071784973145, + "sampling/sampling_logp_difference/mean": 0.01628207042813301, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 173.921875, + "completions/mean_terminated_length": 173.921875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.40627628564834595, + "epoch": 0.2977941176470588, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6958476637616493, + "kl": 0.008724816143512726, + "learning_rate": 9.83739837398374e-07, + "loss": 0.0474, + "num_tokens": 7639942.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6157478094100952, + "sampling/importance_sampling_ratio/mean": 0.999195396900177, + "sampling/importance_sampling_ratio/min": 0.6374641060829163, + "sampling/sampling_logp_difference/max": 0.4797978401184082, + "sampling/sampling_logp_difference/mean": 0.01582256704568863, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 168.8125, + "completions/mean_terminated_length": 168.8125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.40013957023620605, + "epoch": 0.29901960784313725, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2749960429030391, + "kl": 0.009192371740937233, + "learning_rate": 9.878048780487804e-07, + "loss": 0.0099, + "num_tokens": 7667978.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4499542713165283, + "sampling/importance_sampling_ratio/mean": 1.000160813331604, + "sampling/importance_sampling_ratio/min": 0.5401555299758911, + "sampling/sampling_logp_difference/max": 0.6158981323242188, + "sampling/sampling_logp_difference/mean": 0.01715931110084057, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 194.984375, + "completions/mean_terminated_length": 194.984375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.44018101692199707, + "epoch": 0.3002450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021884061756286145, + "kl": 0.008384596556425095, + "learning_rate": 9.918699186991869e-07, + "loss": 0.0001, + "num_tokens": 7706681.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6211398839950562, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.6720573902130127, + "sampling/sampling_logp_difference/max": 0.48312950134277344, + "sampling/sampling_logp_difference/mean": 0.016173996031284332, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 221.828125, + "completions/mean_terminated_length": 221.828125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.44105350971221924, + "epoch": 0.3014705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016795578346263017, + "kl": 0.007627334911376238, + "learning_rate": 9.959349593495935e-07, + "loss": 0.0001, + "num_tokens": 7742446.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 0.9993383288383484, + "sampling/importance_sampling_ratio/min": 0.49609559774398804, + "sampling/sampling_logp_difference/max": 0.7009866237640381, + "sampling/sampling_logp_difference/mean": 0.017043959349393845, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 183.90625, + "completions/mean_terminated_length": 183.90625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.2399781346321106, + "epoch": 0.30269607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019973136134103113, + "kl": 0.006935018114745617, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 7769512.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5591720342636108, + "sampling/importance_sampling_ratio/mean": 0.9999405145645142, + "sampling/importance_sampling_ratio/min": 0.614763617515564, + "sampling/sampling_logp_difference/max": 0.48651742935180664, + "sampling/sampling_logp_difference/mean": 0.011417325586080551, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 149.71875, + "completions/mean_terminated_length": 149.71875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3416861891746521, + "epoch": 0.30392156862745096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026188917228080588, + "kl": 0.010590678080916405, + "learning_rate": 9.99999492515838e-07, + "loss": 0.0001, + "num_tokens": 7796614.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7297691106796265, + "sampling/importance_sampling_ratio/mean": 1.0001978874206543, + "sampling/importance_sampling_ratio/min": 0.617793619632721, + "sampling/sampling_logp_difference/max": 0.5479879379272461, + "sampling/sampling_logp_difference/mean": 0.016203757375478745, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 191.640625, + "completions/mean_terminated_length": 191.640625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.449499249458313, + "epoch": 0.30514705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9278129375037183, + "kl": 0.013040348887443542, + "learning_rate": 9.99997970064382e-07, + "loss": -0.0467, + "num_tokens": 7829743.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6324667930603027, + "sampling/importance_sampling_ratio/mean": 0.9997720718383789, + "sampling/importance_sampling_ratio/min": 0.2991386353969574, + "sampling/sampling_logp_difference/max": 1.20684814453125, + "sampling/sampling_logp_difference/mean": 0.017926493659615517, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 163.578125, + "completions/mean_terminated_length": 163.578125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.41771259903907776, + "epoch": 0.30637254901960786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06760607057143501, + "kl": 0.014883172698318958, + "learning_rate": 9.999954326487227e-07, + "loss": 0.0001, + "num_tokens": 7855284.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.506048321723938, + "sampling/importance_sampling_ratio/mean": 0.9999994039535522, + "sampling/importance_sampling_ratio/min": 0.6286977529525757, + "sampling/sampling_logp_difference/max": 0.46410465240478516, + "sampling/sampling_logp_difference/mean": 0.018769796937704086, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 158.5, + "completions/mean_terminated_length": 158.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3097831606864929, + "epoch": 0.3075980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0385912065717164, + "kl": 0.011668984778225422, + "learning_rate": 9.999918802740106e-07, + "loss": 0.0086, + "num_tokens": 7877812.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.523362636566162, + "sampling/importance_sampling_ratio/mean": 1.0003794431686401, + "sampling/importance_sampling_ratio/min": 0.6056217551231384, + "sampling/sampling_logp_difference/max": 0.5014996528625488, + "sampling/sampling_logp_difference/mean": 0.015177415683865547, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 143.0, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.3114874064922333, + "epoch": 0.3088235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03592110566504937, + "kl": 0.013739445246756077, + "learning_rate": 9.999873129474573e-07, + "loss": 0.0001, + "num_tokens": 7906372.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4397625923156738, + "sampling/importance_sampling_ratio/mean": 0.9998765587806702, + "sampling/importance_sampling_ratio/min": 0.6155447959899902, + "sampling/sampling_logp_difference/max": 0.4852476119995117, + "sampling/sampling_logp_difference/mean": 0.014856807887554169, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 179.921875, + "completions/mean_terminated_length": 179.921875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4216104745864868, + "epoch": 0.31004901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9192886737372505, + "kl": 0.015545014292001724, + "learning_rate": 9.999817306783336e-07, + "loss": -0.009, + "num_tokens": 7932975.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.463365912437439, + "sampling/importance_sampling_ratio/mean": 1.0002342462539673, + "sampling/importance_sampling_ratio/min": 0.6248787641525269, + "sampling/sampling_logp_difference/max": 0.4701976776123047, + "sampling/sampling_logp_difference/mean": 0.017898280173540115, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 147.03125, + "completions/mean_terminated_length": 147.03125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3145422339439392, + "epoch": 0.3112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03524478669237915, + "kl": 0.013532701879739761, + "learning_rate": 9.999751334779714e-07, + "loss": 0.0001, + "num_tokens": 7956961.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.580665111541748, + "sampling/importance_sampling_ratio/mean": 1.0000933408737183, + "sampling/importance_sampling_ratio/min": 0.6483817100524902, + "sampling/sampling_logp_difference/max": 0.45784568786621094, + "sampling/sampling_logp_difference/mean": 0.015270882286131382, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 175.578125, + "completions/mean_terminated_length": 175.578125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.375484824180603, + "epoch": 0.3125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031741820057699044, + "kl": 0.011409718543291092, + "learning_rate": 9.999675213597626e-07, + "loss": 0.0001, + "num_tokens": 7987542.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.524396300315857, + "sampling/importance_sampling_ratio/mean": 1.0004853010177612, + "sampling/importance_sampling_ratio/min": 0.6115931272506714, + "sampling/sampling_logp_difference/max": 0.4916880130767822, + "sampling/sampling_logp_difference/mean": 0.016877297312021255, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 189.0625, + "completions/mean_terminated_length": 189.0625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3867337703704834, + "epoch": 0.3137254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05021085968000162, + "kl": 0.012918060645461082, + "learning_rate": 9.999588943391595e-07, + "loss": 0.0001, + "num_tokens": 8017594.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.417964220046997, + "sampling/importance_sampling_ratio/mean": 0.9997158050537109, + "sampling/importance_sampling_ratio/min": 0.6509522795677185, + "sampling/sampling_logp_difference/max": 0.429318904876709, + "sampling/sampling_logp_difference/mean": 0.017811531201004982, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 145.34375, + "completions/mean_terminated_length": 145.34375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3366623818874359, + "epoch": 0.31495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8153839531646045, + "kl": 0.013880794867873192, + "learning_rate": 9.999492524336742e-07, + "loss": -0.0117, + "num_tokens": 8041200.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6287307739257812, + "sampling/importance_sampling_ratio/mean": 0.9995911717414856, + "sampling/importance_sampling_ratio/min": 0.6185733675956726, + "sampling/sampling_logp_difference/max": 0.48780107498168945, + "sampling/sampling_logp_difference/mean": 0.016190677881240845, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 166.921875, + "completions/mean_terminated_length": 166.921875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3677437901496887, + "epoch": 0.3161764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04122863649489965, + "kl": 0.015979524701833725, + "learning_rate": 9.999385956628792e-07, + "loss": 0.0002, + "num_tokens": 8066907.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4311892986297607, + "sampling/importance_sampling_ratio/mean": 1.000390887260437, + "sampling/importance_sampling_ratio/min": 0.6117782592773438, + "sampling/sampling_logp_difference/max": 0.49138545989990234, + "sampling/sampling_logp_difference/mean": 0.018296608701348305, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 196.4375, + "completions/mean_terminated_length": 196.4375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3097396492958069, + "epoch": 0.3174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.069374312163895, + "kl": 0.011495886370539665, + "learning_rate": 9.999269240484069e-07, + "loss": -0.0209, + "num_tokens": 8097335.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.633513331413269, + "sampling/importance_sampling_ratio/mean": 1.0004537105560303, + "sampling/importance_sampling_ratio/min": 0.6408920288085938, + "sampling/sampling_logp_difference/max": 0.49073314666748047, + "sampling/sampling_logp_difference/mean": 0.014788438566029072, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 190.1875, + "completions/mean_terminated_length": 190.1875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.35195496678352356, + "epoch": 0.31862745098039214, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.024392022286009, + "kl": 0.013708166778087616, + "learning_rate": 9.999142376139503e-07, + "loss": -0.015, + "num_tokens": 8129875.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5133965015411377, + "sampling/importance_sampling_ratio/mean": 0.9998887777328491, + "sampling/importance_sampling_ratio/min": 0.6070231199264526, + "sampling/sampling_logp_difference/max": 0.4991884231567383, + "sampling/sampling_logp_difference/mean": 0.016092892736196518, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 157.921875, + "completions/mean_terminated_length": 157.921875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.309036523103714, + "epoch": 0.31985294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05828600558040216, + "kl": 0.016801748424768448, + "learning_rate": 9.999005363852617e-07, + "loss": 0.0002, + "num_tokens": 8155838.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6100053787231445, + "sampling/importance_sampling_ratio/mean": 0.999739944934845, + "sampling/importance_sampling_ratio/min": 0.6941452622413635, + "sampling/sampling_logp_difference/max": 0.47623753547668457, + "sampling/sampling_logp_difference/mean": 0.014917733147740364, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 237.421875, + "completions/mean_terminated_length": 237.421875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.29509255290031433, + "epoch": 0.32107843137254904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02039980450526182, + "kl": 0.008284160867333412, + "learning_rate": 9.99885820390154e-07, + "loss": 0.0001, + "num_tokens": 8190153.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.425432801246643, + "sampling/importance_sampling_ratio/mean": 1.000622034072876, + "sampling/importance_sampling_ratio/min": 0.6363792419433594, + "sampling/sampling_logp_difference/max": 0.45196056365966797, + "sampling/sampling_logp_difference/mean": 0.01347966305911541, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 194.359375, + "completions/mean_terminated_length": 194.359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.42587071657180786, + "epoch": 0.32230392156862747, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0083109275275735, + "kl": 0.010584939271211624, + "learning_rate": 9.998700896584995e-07, + "loss": -0.0051, + "num_tokens": 8222880.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4777371883392334, + "sampling/importance_sampling_ratio/mean": 0.9997612833976746, + "sampling/importance_sampling_ratio/min": 0.606777012348175, + "sampling/sampling_logp_difference/max": 0.49959397315979004, + "sampling/sampling_logp_difference/mean": 0.016714511439204216, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 253.375, + "completions/mean_terminated_length": 253.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3739390969276428, + "epoch": 0.3235294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01750328991956531, + "kl": 0.007153388112783432, + "learning_rate": 9.998533442222308e-07, + "loss": 0.0001, + "num_tokens": 8258328.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6946955919265747, + "sampling/importance_sampling_ratio/mean": 0.9997098445892334, + "sampling/importance_sampling_ratio/min": 0.6196833848953247, + "sampling/sampling_logp_difference/max": 0.5275031328201294, + "sampling/sampling_logp_difference/mean": 0.015847668051719666, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 212.203125, + "completions/mean_terminated_length": 212.203125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3422619104385376, + "epoch": 0.3247549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1888208198494439, + "kl": 0.012853987514972687, + "learning_rate": 9.9983558411534e-07, + "loss": -0.0012, + "num_tokens": 8287157.0, + "reward": 0.65625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.363533854484558, + "sampling/importance_sampling_ratio/mean": 1.0000767707824707, + "sampling/importance_sampling_ratio/min": 0.6381403803825378, + "sampling/sampling_logp_difference/max": 0.44919705390930176, + "sampling/sampling_logp_difference/mean": 0.015186481177806854, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 236.4375, + "completions/mean_terminated_length": 236.4375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.36014288663864136, + "epoch": 0.32598039215686275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021077412144935612, + "kl": 0.009041134268045425, + "learning_rate": 9.99816809373879e-07, + "loss": 0.0001, + "num_tokens": 8322577.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4862349033355713, + "sampling/importance_sampling_ratio/mean": 0.9995858669281006, + "sampling/importance_sampling_ratio/min": 0.6177365183830261, + "sampling/sampling_logp_difference/max": 0.4816932678222656, + "sampling/sampling_logp_difference/mean": 0.01599959097802639, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 237.65625, + "completions/mean_terminated_length": 237.65625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4498145282268524, + "epoch": 0.3272058823529412, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1473118671642435, + "kl": 0.011207411997020245, + "learning_rate": 9.99797020035959e-07, + "loss": -0.0174, + "num_tokens": 8357211.0, + "reward": 0.34375, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.653535008430481, + "sampling/importance_sampling_ratio/mean": 1.0001318454742432, + "sampling/importance_sampling_ratio/min": 0.6112035512924194, + "sampling/sampling_logp_difference/max": 0.5029153823852539, + "sampling/sampling_logp_difference/mean": 0.01675787940621376, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 213.140625, + "completions/mean_terminated_length": 213.140625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.30425506830215454, + "epoch": 0.3284313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0320770130593047, + "kl": 0.01154828630387783, + "learning_rate": 9.997762161417517e-07, + "loss": 0.0001, + "num_tokens": 8388548.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.530159831047058, + "sampling/importance_sampling_ratio/mean": 1.0002944469451904, + "sampling/importance_sampling_ratio/min": 0.6133034825325012, + "sampling/sampling_logp_difference/max": 0.4888954162597656, + "sampling/sampling_logp_difference/mean": 0.015344790183007717, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 241.640625, + "completions/mean_terminated_length": 241.640625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3790775537490845, + "epoch": 0.32965686274509803, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.93327834126621, + "kl": 0.009812450036406517, + "learning_rate": 9.997543977334873e-07, + "loss": -0.0168, + "num_tokens": 8429437.0, + "reward": -0.25, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.9876813888549805, + "sampling/importance_sampling_ratio/mean": 1.0003782510757446, + "sampling/importance_sampling_ratio/min": 0.7020878195762634, + "sampling/sampling_logp_difference/max": 0.6869688034057617, + "sampling/sampling_logp_difference/mean": 0.014591362327337265, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 206.484375, + "completions/mean_terminated_length": 206.484375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.38758742809295654, + "epoch": 0.33088235294117646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026128193846614904, + "kl": 0.010831797495484352, + "learning_rate": 9.99731564855456e-07, + "loss": 0.0001, + "num_tokens": 8459324.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5065176486968994, + "sampling/importance_sampling_ratio/mean": 0.9998651146888733, + "sampling/importance_sampling_ratio/min": 0.6188350319862366, + "sampling/sampling_logp_difference/max": 0.4799165725708008, + "sampling/sampling_logp_difference/mean": 0.01775408536195755, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 278.890625, + "completions/mean_terminated_length": 278.890625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2929910123348236, + "epoch": 0.3321078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019622187422360302, + "kl": 0.011361843906342983, + "learning_rate": 9.997077175540066e-07, + "loss": 0.0001, + "num_tokens": 8496661.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.534928798675537, + "sampling/importance_sampling_ratio/mean": 1.00014066696167, + "sampling/importance_sampling_ratio/min": 0.7192728519439697, + "sampling/sampling_logp_difference/max": 0.4284839630126953, + "sampling/sampling_logp_difference/mean": 0.012748262844979763, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 210.03125, + "completions/mean_terminated_length": 210.03125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3396623134613037, + "epoch": 0.3333333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8073554666654875, + "kl": 0.013835672289133072, + "learning_rate": 9.996828558775485e-07, + "loss": -0.014, + "num_tokens": 8531031.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.9684661626815796, + "sampling/importance_sampling_ratio/mean": 0.9999529123306274, + "sampling/importance_sampling_ratio/min": 0.5509454011917114, + "sampling/sampling_logp_difference/max": 0.6772546768188477, + "sampling/sampling_logp_difference/mean": 0.014482136815786362, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 223.0, + "completions/mean_terminated_length": 223.0, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.35685327649116516, + "epoch": 0.33455882352941174, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1293923135993438, + "kl": 0.01976989209651947, + "learning_rate": 9.996569798765487e-07, + "loss": -0.0089, + "num_tokens": 8560023.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5511717796325684, + "sampling/importance_sampling_ratio/mean": 1.0003318786621094, + "sampling/importance_sampling_ratio/min": 0.6207550168037415, + "sampling/sampling_logp_difference/max": 0.4768187999725342, + "sampling/sampling_logp_difference/mean": 0.014719195663928986, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 202.34375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3357786536216736, + "epoch": 0.33578431372549017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04071909819670399, + "kl": 0.01715000718832016, + "learning_rate": 9.996300896035338e-07, + "loss": 0.0002, + "num_tokens": 8587821.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4186033010482788, + "sampling/importance_sampling_ratio/mean": 0.9999063014984131, + "sampling/importance_sampling_ratio/min": 0.618531346321106, + "sampling/sampling_logp_difference/max": 0.4804074764251709, + "sampling/sampling_logp_difference/mean": 0.015081456862390041, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 264.359375, + "completions/mean_terminated_length": 264.359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.36299800872802734, + "epoch": 0.33700980392156865, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7979230304488981, + "kl": 0.014299536123871803, + "learning_rate": 9.996021851130896e-07, + "loss": 0.0178, + "num_tokens": 8622788.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6541705131530762, + "sampling/importance_sampling_ratio/mean": 1.0003371238708496, + "sampling/importance_sampling_ratio/min": 0.6095863580703735, + "sampling/sampling_logp_difference/max": 0.5032997131347656, + "sampling/sampling_logp_difference/mean": 0.014425665140151978, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 259.359375, + "completions/mean_terminated_length": 259.359375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.37495720386505127, + "epoch": 0.3382352941176471, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0490685489830045, + "kl": 0.014999349601566792, + "learning_rate": 9.995732664618603e-07, + "loss": 0.1337, + "num_tokens": 8671851.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6221331357955933, + "sampling/importance_sampling_ratio/mean": 1.000325322151184, + "sampling/importance_sampling_ratio/min": 0.38583245873451233, + "sampling/sampling_logp_difference/max": 0.9523520469665527, + "sampling/sampling_logp_difference/mean": 0.015569565817713737, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 270.921875, + "completions/mean_terminated_length": 270.921875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.39126330614089966, + "epoch": 0.3394607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6069572212361701, + "kl": 0.019539829343557358, + "learning_rate": 9.99543333708549e-07, + "loss": -0.0189, + "num_tokens": 8706006.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.4988577365875244, + "sampling/importance_sampling_ratio/mean": 0.9995763897895813, + "sampling/importance_sampling_ratio/min": 0.6227314472198486, + "sampling/sampling_logp_difference/max": 0.47363996505737305, + "sampling/sampling_logp_difference/mean": 0.015419903211295605, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 194.4375, + "completions/mean_terminated_length": 194.4375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.37649017572402954, + "epoch": 0.34068627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053443642162347454, + "kl": 0.02782365307211876, + "learning_rate": 9.995123869139176e-07, + "loss": 0.0003, + "num_tokens": 8731874.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4893184900283813, + "sampling/importance_sampling_ratio/mean": 1.0006440877914429, + "sampling/importance_sampling_ratio/min": 0.6645188331604004, + "sampling/sampling_logp_difference/max": 0.4086921215057373, + "sampling/sampling_logp_difference/mean": 0.01722247712314129, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 284.625, + "completions/mean_terminated_length": 284.625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.5158754587173462, + "epoch": 0.34191176470588236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9434177905899298, + "kl": 0.015656255185604095, + "learning_rate": 9.994804261407854e-07, + "loss": 0.0044, + "num_tokens": 8775930.0, + "reward": 0.625, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6037347316741943, + "sampling/importance_sampling_ratio/mean": 1.0002131462097168, + "sampling/importance_sampling_ratio/min": 0.6202828288078308, + "sampling/sampling_logp_difference/max": 0.47757983207702637, + "sampling/sampling_logp_difference/mean": 0.017696373164653778, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 174.8125, + "completions/mean_terminated_length": 174.8125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4062265157699585, + "epoch": 0.3431372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8061154001343366, + "kl": 0.025968432426452637, + "learning_rate": 9.994474514540312e-07, + "loss": -0.0021, + "num_tokens": 8810638.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6301919221878052, + "sampling/importance_sampling_ratio/mean": 0.9998388886451721, + "sampling/importance_sampling_ratio/min": 0.7160124182701111, + "sampling/sampling_logp_difference/max": 0.48869776725769043, + "sampling/sampling_logp_difference/mean": 0.015875108540058136, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 251.265625, + "completions/mean_terminated_length": 251.265625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.3788436949253082, + "epoch": 0.3443627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0342866177651477, + "kl": 0.020827017724514008, + "learning_rate": 9.994134629205917e-07, + "loss": 0.0002, + "num_tokens": 8843807.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006530284881592, + "sampling/importance_sampling_ratio/min": 0.607172966003418, + "sampling/sampling_logp_difference/max": 0.7796788215637207, + "sampling/sampling_logp_difference/mean": 0.015283560380339622, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 238.96875, + "completions/mean_terminated_length": 238.96875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3100021779537201, + "epoch": 0.34558823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0346732415875164, + "kl": 0.021007856354117393, + "learning_rate": 9.99378460609461e-07, + "loss": 0.0002, + "num_tokens": 8873645.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3672375679016113, + "sampling/importance_sampling_ratio/mean": 1.0003538131713867, + "sampling/importance_sampling_ratio/min": 0.6927396655082703, + "sampling/sampling_logp_difference/max": 0.36710095405578613, + "sampling/sampling_logp_difference/mean": 0.013294361531734467, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 267.90625, + "completions/mean_terminated_length": 267.90625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.5047875642776489, + "epoch": 0.34681372549019607, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0632033845726123, + "kl": 0.021810028702020645, + "learning_rate": 9.993424445916922e-07, + "loss": -0.0005, + "num_tokens": 8908199.0, + "reward": -0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.495995283126831, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.5697722434997559, + "sampling/sampling_logp_difference/max": 0.5625185966491699, + "sampling/sampling_logp_difference/mean": 0.017674528062343597, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 206.4375, + "completions/mean_terminated_length": 206.4375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.38155049085617065, + "epoch": 0.3480392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04724550668822444, + "kl": 0.023773398250341415, + "learning_rate": 9.993054149403949e-07, + "loss": 0.0002, + "num_tokens": 8938131.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.451388955116272, + "sampling/importance_sampling_ratio/mean": 0.9998237490653992, + "sampling/importance_sampling_ratio/min": 0.6486294269561768, + "sampling/sampling_logp_difference/max": 0.4328937530517578, + "sampling/sampling_logp_difference/mean": 0.01579119637608528, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 225.71875, + "completions/mean_terminated_length": 225.71875, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.41737085580825806, + "epoch": 0.3492647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7651160195940809, + "kl": 0.020508062094449997, + "learning_rate": 9.992673717307372e-07, + "loss": 0.0186, + "num_tokens": 8969329.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4221943616867065, + "sampling/importance_sampling_ratio/mean": 1.000159502029419, + "sampling/importance_sampling_ratio/min": 0.6715134382247925, + "sampling/sampling_logp_difference/max": 0.3982212543487549, + "sampling/sampling_logp_difference/mean": 0.01604665070772171, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 335.9375, + "completions/mean_terminated_length": 335.9375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.4537578523159027, + "epoch": 0.35049019607843135, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9120153238112306, + "kl": 0.016356246545910835, + "learning_rate": 9.992283150399446e-07, + "loss": 0.0165, + "num_tokens": 9010605.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.4032405614852905, + "sampling/importance_sampling_ratio/mean": 1.000108003616333, + "sampling/importance_sampling_ratio/min": 0.623317301273346, + "sampling/sampling_logp_difference/max": 0.4726996421813965, + "sampling/sampling_logp_difference/mean": 0.015053913928568363, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 206.34375, + "completions/mean_terminated_length": 206.34375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3335433602333069, + "epoch": 0.35171568627450983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04277451366755491, + "kl": 0.027477117255330086, + "learning_rate": 9.991882449472994e-07, + "loss": 0.0003, + "num_tokens": 9037699.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4695838689804077, + "sampling/importance_sampling_ratio/mean": 1.0002992153167725, + "sampling/importance_sampling_ratio/min": 0.6815792918205261, + "sampling/sampling_logp_difference/max": 0.384979248046875, + "sampling/sampling_logp_difference/mean": 0.014059789478778839, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 267.671875, + "completions/mean_terminated_length": 267.671875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.3319460451602936, + "epoch": 0.35294117647058826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03102749230494356, + "kl": 0.021260742098093033, + "learning_rate": 9.991471615341415e-07, + "loss": 0.0002, + "num_tokens": 9073310.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4058274030685425, + "sampling/importance_sampling_ratio/mean": 1.0000005960464478, + "sampling/importance_sampling_ratio/min": 0.722834050655365, + "sampling/sampling_logp_difference/max": 0.3406260013580322, + "sampling/sampling_logp_difference/mean": 0.012393254786729813, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.4477394223213196, + "epoch": 0.3541666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.702509219757032, + "kl": 0.020046524703502655, + "learning_rate": 9.991050648838675e-07, + "loss": 0.0115, + "num_tokens": 9116958.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6007344722747803, + "sampling/importance_sampling_ratio/mean": 1.0000159740447998, + "sampling/importance_sampling_ratio/min": 0.6227967143058777, + "sampling/sampling_logp_difference/max": 0.47353506088256836, + "sampling/sampling_logp_difference/mean": 0.015253344550728798, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 261.078125, + "completions/mean_terminated_length": 261.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.5179446935653687, + "epoch": 0.3553921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.245212156846121, + "kl": 0.02527690678834915, + "learning_rate": 9.990619550819312e-07, + "loss": -0.0282, + "num_tokens": 9154323.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.2850115299224854, + "sampling/importance_sampling_ratio/mean": 1.0000402927398682, + "sampling/importance_sampling_ratio/min": 0.6368844509124756, + "sampling/sampling_logp_difference/max": 0.45116710662841797, + "sampling/sampling_logp_difference/mean": 0.016047444194555283, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 226.859375, + "completions/mean_terminated_length": 226.859375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.42619866132736206, + "epoch": 0.35661764705882354, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9808054354022362, + "kl": 0.035265617072582245, + "learning_rate": 9.990178322158424e-07, + "loss": 0.011, + "num_tokens": 9184938.0, + "reward": 0.65625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.435062289237976, + "sampling/importance_sampling_ratio/mean": 1.000767707824707, + "sampling/importance_sampling_ratio/min": 0.6609618663787842, + "sampling/sampling_logp_difference/max": 0.4140591621398926, + "sampling/sampling_logp_difference/mean": 0.015793394297361374, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 265.328125, + "completions/mean_terminated_length": 265.328125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.4519370198249817, + "epoch": 0.35784313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8398408474879482, + "kl": 0.024876803159713745, + "learning_rate": 9.989726963751682e-07, + "loss": -0.0675, + "num_tokens": 9223935.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 1.0003069639205933, + "sampling/importance_sampling_ratio/min": 0.681201696395874, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.014846328645944595, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 280.984375, + "completions/mean_terminated_length": 280.984375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3856172263622284, + "epoch": 0.3590686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9592771356076676, + "kl": 0.02183361165225506, + "learning_rate": 9.989265476515309e-07, + "loss": 0.0004, + "num_tokens": 9261886.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6392227411270142, + "sampling/importance_sampling_ratio/mean": 0.9999226927757263, + "sampling/importance_sampling_ratio/min": 0.5483723878860474, + "sampling/sampling_logp_difference/max": 0.600800633430481, + "sampling/sampling_logp_difference/mean": 0.012838400900363922, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 268.109375, + "completions/mean_terminated_length": 268.109375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.3931152820587158, + "epoch": 0.3602941176470588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041518879366459226, + "kl": 0.026410872116684914, + "learning_rate": 9.9887938613861e-07, + "loss": 0.0002, + "num_tokens": 9300501.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4378784894943237, + "sampling/importance_sampling_ratio/mean": 1.0005898475646973, + "sampling/importance_sampling_ratio/min": 0.5106711983680725, + "sampling/sampling_logp_difference/max": 0.6720293760299683, + "sampling/sampling_logp_difference/mean": 0.01435445062816143, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 246.84375, + "completions/mean_terminated_length": 246.84375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.5600332021713257, + "epoch": 0.36151960784313725, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0508689845863735, + "kl": 0.02834884449839592, + "learning_rate": 9.988312119321402e-07, + "loss": 0.0104, + "num_tokens": 9331275.0, + "reward": 0.5, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4296252727508545, + "sampling/importance_sampling_ratio/mean": 1.000156283378601, + "sampling/importance_sampling_ratio/min": 0.6961706280708313, + "sampling/sampling_logp_difference/max": 0.36216044425964355, + "sampling/sampling_logp_difference/mean": 0.01765412464737892, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 236.4375, + "completions/mean_terminated_length": 236.4375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.5023055076599121, + "epoch": 0.3627450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0406992424663923, + "kl": 0.02572363056242466, + "learning_rate": 9.98782025129912e-07, + "loss": 0.0237, + "num_tokens": 9363319.0, + "reward": -0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4045171737670898, + "sampling/importance_sampling_ratio/mean": 1.0002970695495605, + "sampling/importance_sampling_ratio/min": 0.6565462946891785, + "sampling/sampling_logp_difference/max": 0.4207620620727539, + "sampling/sampling_logp_difference/mean": 0.016197221353650093, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 244.09375, + "completions/mean_terminated_length": 244.09375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3690158724784851, + "epoch": 0.3639705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1297929680290169, + "kl": 0.02254669740796089, + "learning_rate": 9.987318258317715e-07, + "loss": -0.0009, + "num_tokens": 9394173.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5062799453735352, + "sampling/importance_sampling_ratio/mean": 0.9997458457946777, + "sampling/importance_sampling_ratio/min": 0.48347559571266174, + "sampling/sampling_logp_difference/max": 0.7267544269561768, + "sampling/sampling_logp_difference/mean": 0.013869008049368858, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 214.109375, + "completions/mean_terminated_length": 214.109375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.4830111861228943, + "epoch": 0.36519607843137253, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1628297212132652, + "kl": 0.02464178390800953, + "learning_rate": 9.986806141396205e-07, + "loss": 0.0074, + "num_tokens": 9425220.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3627346754074097, + "sampling/importance_sampling_ratio/mean": 1.0003234148025513, + "sampling/importance_sampling_ratio/min": 0.6627572178840637, + "sampling/sampling_logp_difference/max": 0.41134655475616455, + "sampling/sampling_logp_difference/mean": 0.016353249549865723, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 199.421875, + "completions/mean_terminated_length": 199.421875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.45876309275627136, + "epoch": 0.36642156862745096, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9559940374465532, + "kl": 0.02711273357272148, + "learning_rate": 9.986283901574149e-07, + "loss": 0.0228, + "num_tokens": 9454127.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.8181120157241821, + "sampling/importance_sampling_ratio/mean": 1.000248908996582, + "sampling/importance_sampling_ratio/min": 0.6377933621406555, + "sampling/sampling_logp_difference/max": 0.5977985858917236, + "sampling/sampling_logp_difference/mean": 0.016439270228147507, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 205.0625, + "completions/mean_terminated_length": 205.0625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.446953684091568, + "epoch": 0.36764705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9389807093353264, + "kl": 0.02266453579068184, + "learning_rate": 9.985751539911664e-07, + "loss": -0.0135, + "num_tokens": 9485459.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5480645895004272, + "sampling/importance_sampling_ratio/mean": 1.0001177787780762, + "sampling/importance_sampling_ratio/min": 0.7234283089637756, + "sampling/sampling_logp_difference/max": 0.43700551986694336, + "sampling/sampling_logp_difference/mean": 0.015539245679974556, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 241.84375, + "completions/mean_terminated_length": 241.84375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.4896894693374634, + "epoch": 0.36887254901960786, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2370543993976322, + "kl": 0.017642877995967865, + "learning_rate": 9.985209057489408e-07, + "loss": 0.0249, + "num_tokens": 9519673.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.608892560005188, + "sampling/importance_sampling_ratio/mean": 1.0001373291015625, + "sampling/importance_sampling_ratio/min": 0.6986686587333679, + "sampling/sampling_logp_difference/max": 0.47554612159729004, + "sampling/sampling_logp_difference/mean": 0.01651783287525177, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 209.046875, + "completions/mean_terminated_length": 209.046875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.45735979080200195, + "epoch": 0.3700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.933172548990412, + "kl": 0.020198900252580643, + "learning_rate": 9.98465645540859e-07, + "loss": -0.0088, + "num_tokens": 9549532.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.413565993309021, + "sampling/importance_sampling_ratio/mean": 0.9999147057533264, + "sampling/importance_sampling_ratio/min": 0.6831852793693542, + "sampling/sampling_logp_difference/max": 0.3809892237186432, + "sampling/sampling_logp_difference/mean": 0.01622258871793747, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 214.171875, + "completions/mean_terminated_length": 214.171875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.45174339413642883, + "epoch": 0.3713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02551175006893155, + "kl": 0.0194728821516037, + "learning_rate": 9.984093734790954e-07, + "loss": 0.0002, + "num_tokens": 9583463.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.418564796447754, + "sampling/importance_sampling_ratio/mean": 0.9997199773788452, + "sampling/importance_sampling_ratio/min": 0.662344753742218, + "sampling/sampling_logp_difference/max": 0.41196906566619873, + "sampling/sampling_logp_difference/mean": 0.016074877232313156, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 216.8125, + "completions/mean_terminated_length": 216.8125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4414713978767395, + "epoch": 0.37254901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3309676504186365, + "kl": 0.015547841787338257, + "learning_rate": 9.983520896778788e-07, + "loss": 0.0445, + "num_tokens": 9622219.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.639810562133789, + "sampling/importance_sampling_ratio/mean": 1.0012476444244385, + "sampling/importance_sampling_ratio/min": 0.49714693427085876, + "sampling/sampling_logp_difference/max": 0.6988697052001953, + "sampling/sampling_logp_difference/mean": 0.016059590503573418, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 187.75, + "completions/mean_terminated_length": 187.75, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4350166916847229, + "epoch": 0.3737745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2038721713622835, + "kl": 0.020988553762435913, + "learning_rate": 9.982937942534917e-07, + "loss": -0.0323, + "num_tokens": 9650875.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5613172054290771, + "sampling/importance_sampling_ratio/mean": 1.0004253387451172, + "sampling/importance_sampling_ratio/min": 0.7304356694221497, + "sampling/sampling_logp_difference/max": 0.4455298185348511, + "sampling/sampling_logp_difference/mean": 0.01565871387720108, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 186.765625, + "completions/mean_terminated_length": 186.765625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.36607080698013306, + "epoch": 0.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02285386951927824, + "kl": 0.01576017215847969, + "learning_rate": 9.982344873242701e-07, + "loss": 0.0001, + "num_tokens": 9678460.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003432035446167, + "sampling/importance_sampling_ratio/min": 0.5685401558876038, + "sampling/sampling_logp_difference/max": 0.7368893623352051, + "sampling/sampling_logp_difference/mean": 0.015342392027378082, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 187.296875, + "completions/mean_terminated_length": 187.296875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.36356499791145325, + "epoch": 0.3762254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023063440600103766, + "kl": 0.016244564205408096, + "learning_rate": 9.981741690106034e-07, + "loss": 0.0002, + "num_tokens": 9709743.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8790457248687744, + "sampling/importance_sampling_ratio/mean": 0.9994124174118042, + "sampling/importance_sampling_ratio/min": 0.6109957695007324, + "sampling/sampling_logp_difference/max": 0.6307640075683594, + "sampling/sampling_logp_difference/mean": 0.015056891366839409, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 212.359375, + "completions/mean_terminated_length": 212.359375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.32220736145973206, + "epoch": 0.37745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015985507237566882, + "kl": 0.013575403019785881, + "learning_rate": 9.981128394349337e-07, + "loss": 0.0001, + "num_tokens": 9740470.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4561007022857666, + "sampling/importance_sampling_ratio/mean": 0.9995553493499756, + "sampling/importance_sampling_ratio/min": 0.6257768273353577, + "sampling/sampling_logp_difference/max": 0.4687614440917969, + "sampling/sampling_logp_difference/mean": 0.01345759630203247, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 188.265625, + "completions/mean_terminated_length": 188.265625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.37388908863067627, + "epoch": 0.3786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02378861837544395, + "kl": 0.018611183390021324, + "learning_rate": 9.980504987217566e-07, + "loss": 0.0002, + "num_tokens": 9767383.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5393632650375366, + "sampling/importance_sampling_ratio/mean": 0.9997674226760864, + "sampling/importance_sampling_ratio/min": 0.48920756578445435, + "sampling/sampling_logp_difference/max": 0.7149684429168701, + "sampling/sampling_logp_difference/mean": 0.01607775315642357, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 225.015625, + "completions/mean_terminated_length": 225.015625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4121472239494324, + "epoch": 0.3799019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018721720571636468, + "kl": 0.018510695546865463, + "learning_rate": 9.979871469976195e-07, + "loss": 0.0002, + "num_tokens": 9803384.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5797837972640991, + "sampling/importance_sampling_ratio/mean": 1.0007458925247192, + "sampling/importance_sampling_ratio/min": 0.6805833578109741, + "sampling/sampling_logp_difference/max": 0.4572880268096924, + "sampling/sampling_logp_difference/mean": 0.015674971044063568, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 181.40625, + "completions/mean_terminated_length": 181.40625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.42017343640327454, + "epoch": 0.38112745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02361696383934872, + "kl": 0.017823034897446632, + "learning_rate": 9.979227843911224e-07, + "loss": 0.0002, + "num_tokens": 9837234.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004793405532837, + "sampling/importance_sampling_ratio/min": 0.6095881462097168, + "sampling/sampling_logp_difference/max": 0.8857212066650391, + "sampling/sampling_logp_difference/mean": 0.01524389162659645, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 222.171875, + "completions/mean_terminated_length": 222.171875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3985661268234253, + "epoch": 0.38235294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019965078795185862, + "kl": 0.01637699082493782, + "learning_rate": 9.978574110329172e-07, + "loss": 0.0002, + "num_tokens": 9874541.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3650709390640259, + "sampling/importance_sampling_ratio/mean": 1.000051736831665, + "sampling/importance_sampling_ratio/min": 0.659122884273529, + "sampling/sampling_logp_difference/max": 0.41684532165527344, + "sampling/sampling_logp_difference/mean": 0.015085380524396896, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 222.25, + "completions/mean_terminated_length": 222.25, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.37935322523117065, + "epoch": 0.38357843137254904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01919782391360173, + "kl": 0.01602853089570999, + "learning_rate": 9.977910270557078e-07, + "loss": 0.0002, + "num_tokens": 9910253.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.44670832157135, + "sampling/importance_sampling_ratio/mean": 0.9998260736465454, + "sampling/importance_sampling_ratio/min": 0.6405990123748779, + "sampling/sampling_logp_difference/max": 0.44535160064697266, + "sampling/sampling_logp_difference/mean": 0.01509149931371212, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 184.734375, + "completions/mean_terminated_length": 184.734375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.41767391562461853, + "epoch": 0.38480392156862747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023667493417852292, + "kl": 0.02086430788040161, + "learning_rate": 9.977236325942497e-07, + "loss": 0.0002, + "num_tokens": 9942540.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5250335931777954, + "sampling/importance_sampling_ratio/mean": 0.9999814629554749, + "sampling/importance_sampling_ratio/min": 0.6375662684440613, + "sampling/sampling_logp_difference/max": 0.45009708404541016, + "sampling/sampling_logp_difference/mean": 0.015893224626779556, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 221.0, + "completions/mean_terminated_length": 221.0, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.431037575006485, + "epoch": 0.3860294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019758936015441354, + "kl": 0.017636163160204887, + "learning_rate": 9.97655227785349e-07, + "loss": 0.0002, + "num_tokens": 9973980.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6710313558578491, + "sampling/importance_sampling_ratio/mean": 1.0001165866851807, + "sampling/importance_sampling_ratio/min": 0.529052197933197, + "sampling/sampling_logp_difference/max": 0.6366682052612305, + "sampling/sampling_logp_difference/mean": 0.015019385144114494, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 200.671875, + "completions/mean_terminated_length": 200.671875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.39899998903274536, + "epoch": 0.3872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021624545982917426, + "kl": 0.018345296382904053, + "learning_rate": 9.975858127678633e-07, + "loss": 0.0002, + "num_tokens": 10006103.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5327506065368652, + "sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.5874239802360535, + "sampling/sampling_logp_difference/max": 0.5320084095001221, + "sampling/sampling_logp_difference/mean": 0.014733761548995972, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 207.515625, + "completions/mean_terminated_length": 207.515625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.42619267106056213, + "epoch": 0.38848039215686275, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7446097893106156, + "kl": 0.021541431546211243, + "learning_rate": 9.975153876827007e-07, + "loss": 0.0084, + "num_tokens": 10037224.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4943957328796387, + "sampling/importance_sampling_ratio/mean": 1.000223159790039, + "sampling/importance_sampling_ratio/min": 0.5525302290916443, + "sampling/sampling_logp_difference/max": 0.5932471752166748, + "sampling/sampling_logp_difference/mean": 0.01602860540151596, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 179.375, + "completions/mean_terminated_length": 179.375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2825194001197815, + "epoch": 0.3897058823529412, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022026282512501064, + "kl": 0.016508134081959724, + "learning_rate": 9.974439526728196e-07, + "loss": 0.0002, + "num_tokens": 10067248.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3455226421356201, + "sampling/importance_sampling_ratio/mean": 0.9993760585784912, + "sampling/importance_sampling_ratio/min": 0.6548749804496765, + "sampling/sampling_logp_difference/max": 0.42331087589263916, + "sampling/sampling_logp_difference/mean": 0.012481987476348877, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 184.34375, + "completions/mean_terminated_length": 184.34375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.38290348649024963, + "epoch": 0.3909313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02523643531495567, + "kl": 0.019662799313664436, + "learning_rate": 9.973715078832286e-07, + "loss": 0.0002, + "num_tokens": 10095030.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3655370473861694, + "sampling/importance_sampling_ratio/mean": 1.0000557899475098, + "sampling/importance_sampling_ratio/min": 0.7608779668807983, + "sampling/sampling_logp_difference/max": 0.31154775619506836, + "sampling/sampling_logp_difference/mean": 0.014857003465294838, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 174.203125, + "completions/mean_terminated_length": 174.203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.333204448223114, + "epoch": 0.39215686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024831705119286045, + "kl": 0.025300707668066025, + "learning_rate": 9.97298053460986e-07, + "loss": 0.0002, + "num_tokens": 10125155.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.508152723312378, + "sampling/importance_sampling_ratio/mean": 1.000302791595459, + "sampling/importance_sampling_ratio/min": 0.3722725808620453, + "sampling/sampling_logp_difference/max": 0.9881290197372437, + "sampling/sampling_logp_difference/mean": 0.014936529099941254, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 149.25, + "completions/mean_terminated_length": 149.25, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2664256691932678, + "epoch": 0.39338235294117646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018332434252923264, + "kl": 0.01409104559570551, + "learning_rate": 9.972235895552e-07, + "loss": 0.0001, + "num_tokens": 10149203.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6294610500335693, + "sampling/importance_sampling_ratio/mean": 1.0005340576171875, + "sampling/importance_sampling_ratio/min": 0.744778037071228, + "sampling/sampling_logp_difference/max": 0.4882493019104004, + "sampling/sampling_logp_difference/mean": 0.013930534943938255, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 186.515625, + "completions/mean_terminated_length": 186.515625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.40586355328559875, + "epoch": 0.3946078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02064638541886519, + "kl": 0.020959127694368362, + "learning_rate": 9.971481163170269e-07, + "loss": 0.0002, + "num_tokens": 10182420.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4098576307296753, + "sampling/importance_sampling_ratio/mean": 0.9997353553771973, + "sampling/importance_sampling_ratio/min": 0.6208242774009705, + "sampling/sampling_logp_difference/max": 0.47670722007751465, + "sampling/sampling_logp_difference/mean": 0.015238940715789795, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 184.25, + "completions/mean_terminated_length": 184.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3228031396865845, + "epoch": 0.3958333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024632588296384848, + "kl": 0.018601976335048676, + "learning_rate": 9.97071633899673e-07, + "loss": 0.0002, + "num_tokens": 10209444.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6561650037765503, + "sampling/importance_sampling_ratio/mean": 1.0004572868347168, + "sampling/importance_sampling_ratio/min": 0.014291869476437569, + "sampling/sampling_logp_difference/max": 4.2480645179748535, + "sampling/sampling_logp_difference/mean": 0.014869332313537598, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 192.9375, + "completions/mean_terminated_length": 192.9375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.35080116987228394, + "epoch": 0.39705882352941174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024710164205598798, + "kl": 0.018571950495243073, + "learning_rate": 9.969941424583925e-07, + "loss": 0.0002, + "num_tokens": 10243040.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5731862783432007, + "sampling/importance_sampling_ratio/mean": 1.0000193119049072, + "sampling/importance_sampling_ratio/min": 0.6066892147064209, + "sampling/sampling_logp_difference/max": 0.4997386932373047, + "sampling/sampling_logp_difference/mean": 0.014886989258229733, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 165.265625, + "completions/mean_terminated_length": 165.265625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.34618815779685974, + "epoch": 0.39828431372549017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024235228506903773, + "kl": 0.020537182688713074, + "learning_rate": 9.969156421504887e-07, + "loss": 0.0002, + "num_tokens": 10272577.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9551204442977905, + "sampling/importance_sampling_ratio/mean": 1.0000271797180176, + "sampling/importance_sampling_ratio/min": 0.6173092722892761, + "sampling/sampling_logp_difference/max": 0.6704518795013428, + "sampling/sampling_logp_difference/mean": 0.016670338809490204, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 152.578125, + "completions/mean_terminated_length": 152.578125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.34937143325805664, + "epoch": 0.39950980392156865, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.309998561306225, + "kl": 0.036862075328826904, + "learning_rate": 9.968361331353116e-07, + "loss": 0.0184, + "num_tokens": 10296006.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.338853359222412, + "sampling/importance_sampling_ratio/mean": 1.0007468461990356, + "sampling/importance_sampling_ratio/min": 0.4755912721157074, + "sampling/sampling_logp_difference/max": 0.7431964874267578, + "sampling/sampling_logp_difference/mean": 0.015171946957707405, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 180.5625, + "completions/mean_terminated_length": 180.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3997516632080078, + "epoch": 0.4007352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03787665420392663, + "kl": 0.02181834913790226, + "learning_rate": 9.9675561557426e-07, + "loss": 0.0002, + "num_tokens": 10328042.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0001437664031982, + "sampling/importance_sampling_ratio/min": 0.7130023837089539, + "sampling/sampling_logp_difference/max": 0.42380309104919434, + "sampling/sampling_logp_difference/mean": 0.016936684027314186, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 187.078125, + "completions/mean_terminated_length": 187.078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.385425329208374, + "epoch": 0.4019607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026178763304403684, + "kl": 0.021473130211234093, + "learning_rate": 9.966740896307791e-07, + "loss": 0.0002, + "num_tokens": 10359887.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3876692056655884, + "sampling/importance_sampling_ratio/mean": 1.0007128715515137, + "sampling/importance_sampling_ratio/min": 0.7448554039001465, + "sampling/sampling_logp_difference/max": 0.3276255130767822, + "sampling/sampling_logp_difference/mean": 0.013736354187130928, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 176.765625, + "completions/mean_terminated_length": 176.765625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.30048856139183044, + "epoch": 0.40318627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021039406246671645, + "kl": 0.017125394195318222, + "learning_rate": 9.965915554703613e-07, + "loss": 0.0002, + "num_tokens": 10385312.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.470345139503479, + "sampling/importance_sampling_ratio/mean": 0.9999263882637024, + "sampling/importance_sampling_ratio/min": 0.6326866149902344, + "sampling/sampling_logp_difference/max": 0.457780122756958, + "sampling/sampling_logp_difference/mean": 0.0145049337297678, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 154.625, + "completions/mean_terminated_length": 154.625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3266918957233429, + "epoch": 0.40441176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022633059020672665, + "kl": 0.015120496973395348, + "learning_rate": 9.965080132605461e-07, + "loss": 0.0001, + "num_tokens": 10413112.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5924938917160034, + "sampling/importance_sampling_ratio/mean": 1.0009623765945435, + "sampling/importance_sampling_ratio/min": 0.6915599703788757, + "sampling/sampling_logp_difference/max": 0.4653012752532959, + "sampling/sampling_logp_difference/mean": 0.01513220090419054, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 191.328125, + "completions/mean_terminated_length": 191.328125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.40041443705558777, + "epoch": 0.4056372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030299461024198816, + "kl": 0.017881443724036217, + "learning_rate": 9.964234631709185e-07, + "loss": 0.0002, + "num_tokens": 10445837.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4754209518432617, + "sampling/importance_sampling_ratio/mean": 1.0003015995025635, + "sampling/importance_sampling_ratio/min": 0.6172264218330383, + "sampling/sampling_logp_difference/max": 0.48251938819885254, + "sampling/sampling_logp_difference/mean": 0.016025379300117493, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 196.875, + "completions/mean_terminated_length": 196.875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.30025070905685425, + "epoch": 0.4068627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020141493915539384, + "kl": 0.015826791524887085, + "learning_rate": 9.963379053731102e-07, + "loss": 0.0001, + "num_tokens": 10474757.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.478416919708252, + "sampling/importance_sampling_ratio/mean": 0.999915599822998, + "sampling/importance_sampling_ratio/min": 0.6468645334243774, + "sampling/sampling_logp_difference/max": 0.43561840057373047, + "sampling/sampling_logp_difference/mean": 0.01432250440120697, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 157.40625, + "completions/mean_terminated_length": 157.40625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.33026736974716187, + "epoch": 0.40808823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022846751025488637, + "kl": 0.01617838256061077, + "learning_rate": 9.96251340040798e-07, + "loss": 0.0002, + "num_tokens": 10501903.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4032772779464722, + "sampling/importance_sampling_ratio/mean": 1.000197172164917, + "sampling/importance_sampling_ratio/min": 0.6706700921058655, + "sampling/sampling_logp_difference/max": 0.3994779586791992, + "sampling/sampling_logp_difference/mean": 0.015687666833400726, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.35955965518951416, + "epoch": 0.40931372549019607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027193936788723276, + "kl": 0.012852571904659271, + "learning_rate": 9.96163767349704e-07, + "loss": 0.0001, + "num_tokens": 10534649.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6321485042572021, + "sampling/importance_sampling_ratio/mean": 0.9997122287750244, + "sampling/importance_sampling_ratio/min": 0.553581714630127, + "sampling/sampling_logp_difference/max": 0.5913459062576294, + "sampling/sampling_logp_difference/mean": 0.01636279746890068, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 177.984375, + "completions/mean_terminated_length": 177.984375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3666573166847229, + "epoch": 0.4105392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027948584748888235, + "kl": 0.015932895243167877, + "learning_rate": 9.96075187477595e-07, + "loss": 0.0002, + "num_tokens": 10563736.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5915312767028809, + "sampling/importance_sampling_ratio/mean": 1.0004807710647583, + "sampling/importance_sampling_ratio/min": 0.7182236313819885, + "sampling/sampling_logp_difference/max": 0.46469664573669434, + "sampling/sampling_logp_difference/mean": 0.015503861010074615, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 152.984375, + "completions/mean_terminated_length": 152.984375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.27830806374549866, + "epoch": 0.4117647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03193201309027661, + "kl": 0.014680283144116402, + "learning_rate": 9.959856006042828e-07, + "loss": 0.0001, + "num_tokens": 10591527.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5989633798599243, + "sampling/importance_sampling_ratio/mean": 1.0004345178604126, + "sampling/importance_sampling_ratio/min": 0.6358773708343506, + "sampling/sampling_logp_difference/max": 0.46935558319091797, + "sampling/sampling_logp_difference/mean": 0.013306674547493458, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 178.9375, + "completions/mean_terminated_length": 178.9375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3773849606513977, + "epoch": 0.41299019607843135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04655533106697762, + "kl": 0.023683395236730576, + "learning_rate": 9.95895006911623e-07, + "loss": 0.0002, + "num_tokens": 10624563.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5617190599441528, + "sampling/importance_sampling_ratio/mean": 1.0000473260879517, + "sampling/importance_sampling_ratio/min": 0.6068368554115295, + "sampling/sampling_logp_difference/max": 0.499495267868042, + "sampling/sampling_logp_difference/mean": 0.01680750772356987, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 164.3125, + "completions/mean_terminated_length": 164.3125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2574771046638489, + "epoch": 0.41421568627450983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026002327792873828, + "kl": 0.015414551831781864, + "learning_rate": 9.95803406583515e-07, + "loss": 0.0002, + "num_tokens": 10648855.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5451250076293945, + "sampling/importance_sampling_ratio/mean": 1.000146508216858, + "sampling/importance_sampling_ratio/min": 0.6684028506278992, + "sampling/sampling_logp_difference/max": 0.4351048469543457, + "sampling/sampling_logp_difference/mean": 0.01264483667910099, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 154.65625, + "completions/mean_terminated_length": 154.65625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3974250853061676, + "epoch": 0.41544117647058826, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.334713356953007, + "kl": 0.03008582815527916, + "learning_rate": 9.957107998059018e-07, + "loss": -0.0329, + "num_tokens": 10674385.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6329152584075928, + "sampling/importance_sampling_ratio/mean": 1.0004063844680786, + "sampling/importance_sampling_ratio/min": 0.6714426875114441, + "sampling/sampling_logp_difference/max": 0.49036693572998047, + "sampling/sampling_logp_difference/mean": 0.018222328275442123, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 203.875, + "completions/mean_terminated_length": 203.875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.38176703453063965, + "epoch": 0.4166666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018426472295113922, + "kl": 0.014108864590525627, + "learning_rate": 9.956171867667693e-07, + "loss": 0.0001, + "num_tokens": 10710585.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.820453405380249, + "sampling/importance_sampling_ratio/mean": 0.9997619390487671, + "sampling/importance_sampling_ratio/min": 0.5687109231948853, + "sampling/sampling_logp_difference/max": 0.5990855693817139, + "sampling/sampling_logp_difference/mean": 0.01673772558569908, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 120.515625, + "completions/mean_terminated_length": 120.515625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.320795476436615, + "epoch": 0.4178921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03431113829074164, + "kl": 0.02109440788626671, + "learning_rate": 9.955225676561459e-07, + "loss": 0.0002, + "num_tokens": 10731034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4996675252914429, + "sampling/importance_sampling_ratio/mean": 1.0003200769424438, + "sampling/importance_sampling_ratio/min": 0.6079776287078857, + "sampling/sampling_logp_difference/max": 0.497617244720459, + "sampling/sampling_logp_difference/mean": 0.01737777516245842, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 175.828125, + "completions/mean_terminated_length": 175.828125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2854253053665161, + "epoch": 0.41911764705882354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021964472680569124, + "kl": 0.012618260458111763, + "learning_rate": 9.954269426661022e-07, + "loss": 0.0001, + "num_tokens": 10766447.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4091451168060303, + "sampling/importance_sampling_ratio/mean": 0.999445378780365, + "sampling/importance_sampling_ratio/min": 0.6348267197608948, + "sampling/sampling_logp_difference/max": 0.4544031620025635, + "sampling/sampling_logp_difference/mean": 0.01373360026627779, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 180.5, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3288782835006714, + "epoch": 0.42034313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020066321352772268, + "kl": 0.012781183235347271, + "learning_rate": 9.953303119907513e-07, + "loss": 0.0001, + "num_tokens": 10795631.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6213734149932861, + "sampling/importance_sampling_ratio/mean": 0.9997131824493408, + "sampling/importance_sampling_ratio/min": 0.711065948009491, + "sampling/sampling_logp_difference/max": 0.4832735061645508, + "sampling/sampling_logp_difference/mean": 0.013720612972974777, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 198.28125, + "completions/mean_terminated_length": 198.28125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3548140525817871, + "epoch": 0.4215686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030670964298762787, + "kl": 0.017154211178421974, + "learning_rate": 9.952326758262472e-07, + "loss": 0.0002, + "num_tokens": 10827793.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.40550696849823, + "sampling/importance_sampling_ratio/mean": 1.0001940727233887, + "sampling/importance_sampling_ratio/min": 0.681526243686676, + "sampling/sampling_logp_difference/max": 0.383420467376709, + "sampling/sampling_logp_difference/mean": 0.013858755119144917, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 173.75, + "completions/mean_terminated_length": 173.75, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.24323208630084991, + "epoch": 0.4227941176470588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016511666124535206, + "kl": 0.00972837209701538, + "learning_rate": 9.95134034370785e-07, + "loss": 0.0001, + "num_tokens": 10852577.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6776573657989502, + "sampling/importance_sampling_ratio/mean": 1.0005297660827637, + "sampling/importance_sampling_ratio/min": 0.6209533214569092, + "sampling/sampling_logp_difference/max": 0.5173983573913574, + "sampling/sampling_logp_difference/mean": 0.01275689247995615, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 177.25, + "completions/mean_terminated_length": 177.25, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3620036840438843, + "epoch": 0.42401960784313725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025874137301171645, + "kl": 0.01582196354866028, + "learning_rate": 9.950343878246009e-07, + "loss": 0.0001, + "num_tokens": 10890145.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3052634000778198, + "sampling/importance_sampling_ratio/mean": 1.0003299713134766, + "sampling/importance_sampling_ratio/min": 0.6577809453010559, + "sampling/sampling_logp_difference/max": 0.4188833236694336, + "sampling/sampling_logp_difference/mean": 0.015653233975172043, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 186.359375, + "completions/mean_terminated_length": 186.359375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.37454864382743835, + "epoch": 0.4252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027276294431496238, + "kl": 0.01751563511788845, + "learning_rate": 9.949337363899708e-07, + "loss": 0.0002, + "num_tokens": 10917944.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.496646761894226, + "sampling/importance_sampling_ratio/mean": 1.0000839233398438, + "sampling/importance_sampling_ratio/min": 0.6151504516601562, + "sampling/sampling_logp_difference/max": 0.4858884811401367, + "sampling/sampling_logp_difference/mean": 0.016659047454595566, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 191.21875, + "completions/mean_terminated_length": 191.21875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.402404248714447, + "epoch": 0.4264705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02102957159592185, + "kl": 0.013128215447068214, + "learning_rate": 9.948320802712107e-07, + "loss": 0.0001, + "num_tokens": 10945462.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2981833219528198, + "sampling/importance_sampling_ratio/mean": 1.0000042915344238, + "sampling/importance_sampling_ratio/min": 0.6452677845954895, + "sampling/sampling_logp_difference/max": 0.43808984756469727, + "sampling/sampling_logp_difference/mean": 0.01635223627090454, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 205.875, + "completions/mean_terminated_length": 205.875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.30062228441238403, + "epoch": 0.42769607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01587966621073377, + "kl": 0.012505080550909042, + "learning_rate": 9.947294196746762e-07, + "loss": 0.0001, + "num_tokens": 10977438.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6356579065322876, + "sampling/importance_sampling_ratio/mean": 1.0002219676971436, + "sampling/importance_sampling_ratio/min": 0.6478670239448547, + "sampling/sampling_logp_difference/max": 0.49204516410827637, + "sampling/sampling_logp_difference/mean": 0.01302589476108551, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 174.703125, + "completions/mean_terminated_length": 174.703125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2981036305427551, + "epoch": 0.42892156862745096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01907727053640788, + "kl": 0.011256780475378036, + "learning_rate": 9.946257548087619e-07, + "loss": 0.0001, + "num_tokens": 11004475.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4305928945541382, + "sampling/importance_sampling_ratio/mean": 1.000324010848999, + "sampling/importance_sampling_ratio/min": 0.6029837131500244, + "sampling/sampling_logp_difference/max": 0.5058650970458984, + "sampling/sampling_logp_difference/mean": 0.012895691208541393, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 175.875, + "completions/mean_terminated_length": 175.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.29742923378944397, + "epoch": 0.43014705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013485970659333622, + "kl": 0.009871533140540123, + "learning_rate": 9.945210858839008e-07, + "loss": 0.0001, + "num_tokens": 11032547.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5784273147583008, + "sampling/importance_sampling_ratio/mean": 0.9997193217277527, + "sampling/importance_sampling_ratio/min": 0.6298893690109253, + "sampling/sampling_logp_difference/max": 0.46221113204956055, + "sampling/sampling_logp_difference/mean": 0.013206500560045242, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 253.390625, + "completions/mean_terminated_length": 253.390625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.33359986543655396, + "epoch": 0.43137254901960786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01343310501689489, + "kl": 0.00943258497864008, + "learning_rate": 9.944154131125642e-07, + "loss": 0.0001, + "num_tokens": 11066620.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9546880722045898, + "sampling/importance_sampling_ratio/mean": 1.0005977153778076, + "sampling/importance_sampling_ratio/min": 0.6223419308662415, + "sampling/sampling_logp_difference/max": 0.6702306270599365, + "sampling/sampling_logp_difference/mean": 0.013398932293057442, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 254.5, + "completions/mean_terminated_length": 254.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.3880694508552551, + "epoch": 0.4325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012760186849020817, + "kl": 0.009503064677119255, + "learning_rate": 9.94308736709261e-07, + "loss": 0.0001, + "num_tokens": 11103452.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5074957609176636, + "sampling/importance_sampling_ratio/mean": 1.0005195140838623, + "sampling/importance_sampling_ratio/min": 0.6032688617706299, + "sampling/sampling_logp_difference/max": 0.50539231300354, + "sampling/sampling_logp_difference/mean": 0.01578535884618759, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 196.46875, + "completions/mean_terminated_length": 196.46875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3384985625743866, + "epoch": 0.4338235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018559612037949657, + "kl": 0.011546864174306393, + "learning_rate": 9.94201056890538e-07, + "loss": 0.0001, + "num_tokens": 11133242.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4395679235458374, + "sampling/importance_sampling_ratio/mean": 1.0003654956817627, + "sampling/importance_sampling_ratio/min": 0.703487753868103, + "sampling/sampling_logp_difference/max": 0.36434292793273926, + "sampling/sampling_logp_difference/mean": 0.015101935714483261, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 231.25, + "completions/mean_terminated_length": 231.25, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3752257227897644, + "epoch": 0.43504901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020685644073571794, + "kl": 0.012361589819192886, + "learning_rate": 9.940923738749777e-07, + "loss": 0.0001, + "num_tokens": 11165290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4209574460983276, + "sampling/importance_sampling_ratio/mean": 0.999468207359314, + "sampling/importance_sampling_ratio/min": 0.535356342792511, + "sampling/sampling_logp_difference/max": 0.6248226165771484, + "sampling/sampling_logp_difference/mean": 0.014410814270377159, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 216.15625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4338979423046112, + "epoch": 0.4362745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013385996902813196, + "kl": 0.010194092988967896, + "learning_rate": 9.939826878832003e-07, + "loss": 0.0001, + "num_tokens": 11194596.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5806500911712646, + "sampling/importance_sampling_ratio/mean": 1.0002678632736206, + "sampling/importance_sampling_ratio/min": 0.6268720626831055, + "sampling/sampling_logp_difference/max": 0.467012882232666, + "sampling/sampling_logp_difference/mean": 0.017618417739868164, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 221.03125, + "completions/mean_terminated_length": 221.03125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.40737539529800415, + "epoch": 0.4375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01643896331280935, + "kl": 0.01152168121188879, + "learning_rate": 9.938719991378613e-07, + "loss": 0.0001, + "num_tokens": 11230726.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5277706384658813, + "sampling/importance_sampling_ratio/mean": 1.0003275871276855, + "sampling/importance_sampling_ratio/min": 0.6622360348701477, + "sampling/sampling_logp_difference/max": 0.4238095283508301, + "sampling/sampling_logp_difference/mean": 0.015043385326862335, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 231.75, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.38174504041671753, + "epoch": 0.4387254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013168161127015283, + "kl": 0.008941764943301678, + "learning_rate": 9.937603078636518e-07, + "loss": 0.0001, + "num_tokens": 11271110.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.675676941871643, + "sampling/importance_sampling_ratio/mean": 1.0006225109100342, + "sampling/importance_sampling_ratio/min": 0.5020617842674255, + "sampling/sampling_logp_difference/max": 0.6890320777893066, + "sampling/sampling_logp_difference/mean": 0.016321398317813873, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 160.40625, + "completions/mean_terminated_length": 160.40625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3537214398384094, + "epoch": 0.43995098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018349232248083134, + "kl": 0.014196186326444149, + "learning_rate": 9.936476142872977e-07, + "loss": 0.0001, + "num_tokens": 11294928.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3459526300430298, + "sampling/importance_sampling_ratio/mean": 1.0004903078079224, + "sampling/importance_sampling_ratio/min": 0.6110498309135437, + "sampling/sampling_logp_difference/max": 0.49257683753967285, + "sampling/sampling_logp_difference/mean": 0.01667492464184761, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 200.3125, + "completions/mean_terminated_length": 200.3125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.42494967579841614, + "epoch": 0.4411764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018074421995676333, + "kl": 0.012515506707131863, + "learning_rate": 9.935339186375603e-07, + "loss": 0.0001, + "num_tokens": 11328580.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4254281520843506, + "sampling/importance_sampling_ratio/mean": 0.9997589588165283, + "sampling/importance_sampling_ratio/min": 0.6549595594406128, + "sampling/sampling_logp_difference/max": 0.42318177223205566, + "sampling/sampling_logp_difference/mean": 0.017285365611314774, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 278.140625, + "completions/mean_terminated_length": 278.140625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.41641536355018616, + "epoch": 0.4424019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009015352703555118, + "kl": 0.006647173315286636, + "learning_rate": 9.934192211452344e-07, + "loss": 0.0001, + "num_tokens": 11372749.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4884394407272339, + "sampling/importance_sampling_ratio/mean": 0.9997484087944031, + "sampling/importance_sampling_ratio/min": 0.629547655582428, + "sampling/sampling_logp_difference/max": 0.4627537727355957, + "sampling/sampling_logp_difference/mean": 0.01442030444741249, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 607.0, + "completions/max_terminated_length": 607.0, + "completions/mean_length": 247.359375, + "completions/mean_terminated_length": 247.359375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.35651469230651855, + "epoch": 0.44362745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011251895085230473, + "kl": 0.008346650749444962, + "learning_rate": 9.933035220431487e-07, + "loss": 0.0001, + "num_tokens": 11406612.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.597931146621704, + "sampling/importance_sampling_ratio/mean": 1.0006805658340454, + "sampling/importance_sampling_ratio/min": 0.6154104471206665, + "sampling/sampling_logp_difference/max": 0.48546576499938965, + "sampling/sampling_logp_difference/mean": 0.013788014650344849, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 199.109375, + "completions/mean_terminated_length": 199.109375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4216683506965637, + "epoch": 0.44485294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017696291196703195, + "kl": 0.011852515861392021, + "learning_rate": 9.931868215661647e-07, + "loss": 0.0001, + "num_tokens": 11435675.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5903968811035156, + "sampling/importance_sampling_ratio/mean": 0.9999369978904724, + "sampling/importance_sampling_ratio/min": 0.6195716857910156, + "sampling/sampling_logp_difference/max": 0.478726863861084, + "sampling/sampling_logp_difference/mean": 0.01574837602674961, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 199.15625, + "completions/mean_terminated_length": 199.15625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3079066276550293, + "epoch": 0.44607843137254904, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014264489431204608, + "kl": 0.008546917699277401, + "learning_rate": 9.930691199511773e-07, + "loss": 0.0001, + "num_tokens": 11461461.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.596198558807373, + "sampling/importance_sampling_ratio/mean": 0.999616801738739, + "sampling/importance_sampling_ratio/min": 0.6264073848724365, + "sampling/sampling_logp_difference/max": 0.4677543640136719, + "sampling/sampling_logp_difference/mean": 0.01515682227909565, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 211.71875, + "completions/mean_terminated_length": 211.71875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3385094106197357, + "epoch": 0.44730392156862747, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01982976435718639, + "kl": 0.011798565275967121, + "learning_rate": 9.929504174371136e-07, + "loss": 0.0001, + "num_tokens": 11492899.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.608918309211731, + "sampling/importance_sampling_ratio/mean": 0.9996719360351562, + "sampling/importance_sampling_ratio/min": 0.6418868899345398, + "sampling/sampling_logp_difference/max": 0.47556209564208984, + "sampling/sampling_logp_difference/mean": 0.013919688761234283, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 154.6875, + "completions/mean_terminated_length": 154.6875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35065576434135437, + "epoch": 0.4485294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01633451405040199, + "kl": 0.010045424103736877, + "learning_rate": 9.928307142649314e-07, + "loss": 0.0001, + "num_tokens": 11517439.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5628693103790283, + "sampling/importance_sampling_ratio/mean": 1.0001769065856934, + "sampling/importance_sampling_ratio/min": 0.6255244612693787, + "sampling/sampling_logp_difference/max": 0.4691648483276367, + "sampling/sampling_logp_difference/mean": 0.01635180599987507, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 177.625, + "completions/mean_terminated_length": 177.625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.33547794818878174, + "epoch": 0.4497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019456791341792457, + "kl": 0.012170180678367615, + "learning_rate": 9.927100106776212e-07, + "loss": 0.0001, + "num_tokens": 11543607.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5203474760055542, + "sampling/importance_sampling_ratio/mean": 0.9998880624771118, + "sampling/importance_sampling_ratio/min": 0.6530148983001709, + "sampling/sampling_logp_difference/max": 0.42615532875061035, + "sampling/sampling_logp_difference/mean": 0.014844020828604698, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 172.515625, + "completions/mean_terminated_length": 172.515625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4139445126056671, + "epoch": 0.45098039215686275, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01405004602057797, + "kl": 0.010405349545180798, + "learning_rate": 9.925883069202034e-07, + "loss": 0.0001, + "num_tokens": 11573224.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3822776079177856, + "sampling/importance_sampling_ratio/mean": 0.9990941286087036, + "sampling/importance_sampling_ratio/min": 0.6073088645935059, + "sampling/sampling_logp_difference/max": 0.4987177848815918, + "sampling/sampling_logp_difference/mean": 0.01969890296459198, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 222.3125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.419656366109848, + "epoch": 0.4522058823529412, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9301273362582549, + "kl": 0.010022681206464767, + "learning_rate": 9.92465603239729e-07, + "loss": -0.0052, + "num_tokens": 11606044.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4288361072540283, + "sampling/importance_sampling_ratio/mean": 0.9995024800300598, + "sampling/importance_sampling_ratio/min": 0.6839989423751831, + "sampling/sampling_logp_difference/max": 0.37979888916015625, + "sampling/sampling_logp_difference/mean": 0.01661648415029049, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 190.59375, + "completions/mean_terminated_length": 190.59375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3726652264595032, + "epoch": 0.4534313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013424665837497438, + "kl": 0.009182492271065712, + "learning_rate": 9.923418998852787e-07, + "loss": 0.0001, + "num_tokens": 11632482.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4375884532928467, + "sampling/importance_sampling_ratio/mean": 1.0001118183135986, + "sampling/importance_sampling_ratio/min": 0.5601235032081604, + "sampling/sampling_logp_difference/max": 0.5795979499816895, + "sampling/sampling_logp_difference/mean": 0.017543703317642212, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 192.09375, + "completions/mean_terminated_length": 192.09375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.35122162103652954, + "epoch": 0.45465686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015861282409031972, + "kl": 0.009136844426393509, + "learning_rate": 9.922171971079622e-07, + "loss": 0.0001, + "num_tokens": 11661432.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.329856276512146, + "sampling/importance_sampling_ratio/mean": 1.0001051425933838, + "sampling/importance_sampling_ratio/min": 0.6267584562301636, + "sampling/sampling_logp_difference/max": 0.4671940803527832, + "sampling/sampling_logp_difference/mean": 0.01532377116382122, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 187.515625, + "completions/mean_terminated_length": 187.515625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3589586019515991, + "epoch": 0.45588235294117646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018064289896045117, + "kl": 0.010359864681959152, + "learning_rate": 9.920914951609186e-07, + "loss": 0.0001, + "num_tokens": 11691545.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.505644679069519, + "sampling/importance_sampling_ratio/mean": 0.9999890327453613, + "sampling/importance_sampling_ratio/min": 0.6101424098014832, + "sampling/sampling_logp_difference/max": 0.4940629005432129, + "sampling/sampling_logp_difference/mean": 0.015188705176115036, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 171.90625, + "completions/mean_terminated_length": 171.90625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.33146971464157104, + "epoch": 0.4571078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01367888723644528, + "kl": 0.00988725759088993, + "learning_rate": 9.919647942993147e-07, + "loss": 0.0001, + "num_tokens": 11720611.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3563395738601685, + "sampling/importance_sampling_ratio/mean": 1.0000863075256348, + "sampling/importance_sampling_ratio/min": 0.5471782684326172, + "sampling/sampling_logp_difference/max": 0.6029806137084961, + "sampling/sampling_logp_difference/mean": 0.016517682000994682, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 229.578125, + "completions/mean_terminated_length": 229.578125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3728431463241577, + "epoch": 0.4583333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.181335991355015, + "kl": 0.008848130702972412, + "learning_rate": 9.918370947803455e-07, + "loss": -0.0115, + "num_tokens": 11758824.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4291181564331055, + "sampling/importance_sampling_ratio/mean": 0.9998779296875, + "sampling/importance_sampling_ratio/min": 0.6423242688179016, + "sampling/sampling_logp_difference/max": 0.44266200065612793, + "sampling/sampling_logp_difference/mean": 0.014619621448218822, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 221.59375, + "completions/mean_terminated_length": 221.59375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.297640323638916, + "epoch": 0.45955882352941174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010795928871486645, + "kl": 0.008283271454274654, + "learning_rate": 9.917083968632326e-07, + "loss": 0.0001, + "num_tokens": 11788430.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.507724642753601, + "sampling/importance_sampling_ratio/mean": 0.9998579025268555, + "sampling/importance_sampling_ratio/min": 0.6445702910423279, + "sampling/sampling_logp_difference/max": 0.4391714334487915, + "sampling/sampling_logp_difference/mean": 0.013588126748800278, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 212.171875, + "completions/mean_terminated_length": 212.171875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.41798919439315796, + "epoch": 0.46078431372549017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0345409906844456, + "kl": 0.016850750893354416, + "learning_rate": 9.915787008092246e-07, + "loss": 0.0002, + "num_tokens": 11824425.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6266127824783325, + "sampling/importance_sampling_ratio/mean": 1.0001428127288818, + "sampling/importance_sampling_ratio/min": 0.541965663433075, + "sampling/sampling_logp_difference/max": 0.6125526428222656, + "sampling/sampling_logp_difference/mean": 0.016644926741719246, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 159.953125, + "completions/mean_terminated_length": 159.953125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3381967842578888, + "epoch": 0.46200980392156865, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015688429278366736, + "kl": 0.010663250461220741, + "learning_rate": 9.914480068815961e-07, + "loss": 0.0001, + "num_tokens": 11854566.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.0000499486923218, + "sampling/importance_sampling_ratio/min": 0.5363562703132629, + "sampling/sampling_logp_difference/max": 0.6229566335678101, + "sampling/sampling_logp_difference/mean": 0.015087351202964783, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.313782274723053, + "epoch": 0.4632352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012745304946639297, + "kl": 0.00800234079360962, + "learning_rate": 9.913163153456482e-07, + "loss": 0.0001, + "num_tokens": 11881374.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6307555437088013, + "sampling/importance_sampling_ratio/mean": 1.00034761428833, + "sampling/importance_sampling_ratio/min": 0.6225945353507996, + "sampling/sampling_logp_difference/max": 0.4890434741973877, + "sampling/sampling_logp_difference/mean": 0.014235056936740875, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 249.546875, + "completions/mean_terminated_length": 249.546875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.32097193598747253, + "epoch": 0.4644607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009852273938805821, + "kl": 0.005863037426024675, + "learning_rate": 9.91183626468706e-07, + "loss": 0.0001, + "num_tokens": 11916305.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.434583306312561, + "sampling/importance_sampling_ratio/mean": 1.0002024173736572, + "sampling/importance_sampling_ratio/min": 0.6957331895828247, + "sampling/sampling_logp_difference/max": 0.3627890348434448, + "sampling/sampling_logp_difference/mean": 0.01312682218849659, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 248.1875, + "completions/mean_terminated_length": 248.1875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.4050544798374176, + "epoch": 0.46568627450980393, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016587628782540324, + "kl": 0.008186952210962772, + "learning_rate": 9.910499405201193e-07, + "loss": 0.0001, + "num_tokens": 11951053.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4755452871322632, + "sampling/importance_sampling_ratio/mean": 1.0002648830413818, + "sampling/importance_sampling_ratio/min": 0.6298418641090393, + "sampling/sampling_logp_difference/max": 0.46228647232055664, + "sampling/sampling_logp_difference/mean": 0.01567826420068741, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4453149139881134, + "epoch": 0.46691176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8653692004783323, + "kl": 0.01589384488761425, + "learning_rate": 9.909152577712625e-07, + "loss": -0.0128, + "num_tokens": 11977769.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3567479848861694, + "sampling/importance_sampling_ratio/mean": 0.9998579025268555, + "sampling/importance_sampling_ratio/min": 0.6224786043167114, + "sampling/sampling_logp_difference/max": 0.474045991897583, + "sampling/sampling_logp_difference/mean": 0.019120272248983383, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 222.3125, + "completions/mean_terminated_length": 222.3125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.24300457537174225, + "epoch": 0.4681372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01401523700410047, + "kl": 0.00933674443513155, + "learning_rate": 9.907795784955326e-07, + "loss": 0.0001, + "num_tokens": 12008637.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.404148817062378, + "sampling/importance_sampling_ratio/mean": 0.9994839429855347, + "sampling/importance_sampling_ratio/min": 0.7045887112617493, + "sampling/sampling_logp_difference/max": 0.3501410186290741, + "sampling/sampling_logp_difference/mean": 0.011331282556056976, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 246.21875, + "completions/mean_terminated_length": 246.21875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3826541304588318, + "epoch": 0.4693627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8061377842091467, + "kl": 0.009127024561166763, + "learning_rate": 9.906429029683504e-07, + "loss": 0.0324, + "num_tokens": 12041547.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.3910638093948364, + "sampling/importance_sampling_ratio/mean": 0.9997413158416748, + "sampling/importance_sampling_ratio/min": 0.6038206815719604, + "sampling/sampling_logp_difference/max": 0.5044779777526855, + "sampling/sampling_logp_difference/mean": 0.014083616435527802, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 221.765625, + "completions/mean_terminated_length": 221.765625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3471091687679291, + "epoch": 0.47058823529411764, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8550076145586865, + "kl": 0.010176371783018112, + "learning_rate": 9.90505231467158e-07, + "loss": -0.0033, + "num_tokens": 12078172.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5535870790481567, + "sampling/importance_sampling_ratio/mean": 0.9999591708183289, + "sampling/importance_sampling_ratio/min": 0.6087507009506226, + "sampling/sampling_logp_difference/max": 0.49634647369384766, + "sampling/sampling_logp_difference/mean": 0.014170932583510876, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 243.6875, + "completions/mean_terminated_length": 243.6875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.32598116993904114, + "epoch": 0.47181372549019607, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017329786007415797, + "kl": 0.01097575668245554, + "learning_rate": 9.903665642714204e-07, + "loss": 0.0001, + "num_tokens": 12111400.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.510948896408081, + "sampling/importance_sampling_ratio/mean": 0.9996094703674316, + "sampling/importance_sampling_ratio/min": 0.5932686924934387, + "sampling/sampling_logp_difference/max": 0.5221078395843506, + "sampling/sampling_logp_difference/mean": 0.012972252443432808, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 221.65625, + "completions/mean_terminated_length": 221.65625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3300212621688843, + "epoch": 0.4730392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012453816009873477, + "kl": 0.008042341098189354, + "learning_rate": 9.90226901662623e-07, + "loss": 0.0001, + "num_tokens": 12141218.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5358397960662842, + "sampling/importance_sampling_ratio/mean": 1.0001249313354492, + "sampling/importance_sampling_ratio/min": 0.2011961191892624, + "sampling/sampling_logp_difference/max": 1.6034750938415527, + "sampling/sampling_logp_difference/mean": 0.017671309411525726, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 214.640625, + "completions/mean_terminated_length": 214.640625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.35195472836494446, + "epoch": 0.4742647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016225673081601708, + "kl": 0.009677985683083534, + "learning_rate": 9.900862439242718e-07, + "loss": 0.0001, + "num_tokens": 12172651.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4994847774505615, + "sampling/importance_sampling_ratio/mean": 0.9996469616889954, + "sampling/importance_sampling_ratio/min": 0.5029210448265076, + "sampling/sampling_logp_difference/max": 0.6873221397399902, + "sampling/sampling_logp_difference/mean": 0.015036087483167648, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 201.375, + "completions/mean_terminated_length": 201.375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.39445775747299194, + "epoch": 0.47549019607843135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01883372708158617, + "kl": 0.009737227112054825, + "learning_rate": 9.899445913418935e-07, + "loss": 0.0001, + "num_tokens": 12207075.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.554342269897461, + "sampling/importance_sampling_ratio/mean": 1.0000633001327515, + "sampling/importance_sampling_ratio/min": 0.6536708474159241, + "sampling/sampling_logp_difference/max": 0.4410524368286133, + "sampling/sampling_logp_difference/mean": 0.015788394957780838, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 213.484375, + "completions/mean_terminated_length": 213.484375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.31441351771354675, + "epoch": 0.47671568627450983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02090207683091764, + "kl": 0.010597823187708855, + "learning_rate": 9.898019442030337e-07, + "loss": 0.0001, + "num_tokens": 12235378.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9693632125854492, + "sampling/importance_sampling_ratio/mean": 1.0002236366271973, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.6777102947235107, + "sampling/sampling_logp_difference/mean": 0.015006947331130505, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 167.25, + "completions/mean_terminated_length": 167.25, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.43768465518951416, + "epoch": 0.47794117647058826, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022032048994189212, + "kl": 0.01550086960196495, + "learning_rate": 9.89658302797257e-07, + "loss": 0.0001, + "num_tokens": 12260194.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5772755146026611, + "sampling/importance_sampling_ratio/mean": 1.0005762577056885, + "sampling/importance_sampling_ratio/min": 0.6577732563018799, + "sampling/sampling_logp_difference/max": 0.45569896697998047, + "sampling/sampling_logp_difference/mean": 0.018796630203723907, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 192.140625, + "completions/mean_terminated_length": 192.140625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.340323269367218, + "epoch": 0.4791666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01645434437036542, + "kl": 0.009912341833114624, + "learning_rate": 9.895136674161464e-07, + "loss": 0.0001, + "num_tokens": 12289787.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6626406908035278, + "sampling/importance_sampling_ratio/mean": 1.0005561113357544, + "sampling/importance_sampling_ratio/min": 0.7020365595817566, + "sampling/sampling_logp_difference/max": 0.5084071159362793, + "sampling/sampling_logp_difference/mean": 0.01474553719162941, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 213.734375, + "completions/mean_terminated_length": 213.734375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.48565277457237244, + "epoch": 0.4803921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0367646587482953, + "kl": 0.014473862014710903, + "learning_rate": 9.893680383533024e-07, + "loss": -0.002, + "num_tokens": 12323146.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 0.9999999403953552, + "sampling/importance_sampling_ratio/min": 0.6321954727172852, + "sampling/sampling_logp_difference/max": 0.4585566520690918, + "sampling/sampling_logp_difference/mean": 0.019393594935536385, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 237.09375, + "completions/mean_terminated_length": 237.09375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3610077202320099, + "epoch": 0.48161764705882354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023214835044770902, + "kl": 0.011279666796326637, + "learning_rate": 9.892214159043433e-07, + "loss": 0.0001, + "num_tokens": 12358272.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5280091762542725, + "sampling/importance_sampling_ratio/mean": 1.00017249584198, + "sampling/importance_sampling_ratio/min": 0.6086266040802002, + "sampling/sampling_logp_difference/max": 0.4965503215789795, + "sampling/sampling_logp_difference/mean": 0.01587025076150894, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 172.328125, + "completions/mean_terminated_length": 172.328125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3241961598396301, + "epoch": 0.48284313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027468222552452255, + "kl": 0.011704692617058754, + "learning_rate": 9.890738003669027e-07, + "loss": 0.0001, + "num_tokens": 12384885.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.405652642250061, + "sampling/importance_sampling_ratio/mean": 0.9996992349624634, + "sampling/importance_sampling_ratio/min": 0.6394681334495544, + "sampling/sampling_logp_difference/max": 0.44711852073669434, + "sampling/sampling_logp_difference/mean": 0.015274315141141415, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 219.75, + "completions/mean_terminated_length": 219.75, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.37035873532295227, + "epoch": 0.4840686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017759354131219995, + "kl": 0.011212404817342758, + "learning_rate": 9.889251920406312e-07, + "loss": 0.0001, + "num_tokens": 12413861.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4374908208847046, + "sampling/importance_sampling_ratio/mean": 0.9999295473098755, + "sampling/importance_sampling_ratio/min": 0.5719456076622009, + "sampling/sampling_logp_difference/max": 0.5587114095687866, + "sampling/sampling_logp_difference/mean": 0.015050732530653477, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 237.125, + "completions/mean_terminated_length": 237.125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.2652778625488281, + "epoch": 0.4852941176470588, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01741930717185592, + "kl": 0.008691548369824886, + "learning_rate": 9.887755912271942e-07, + "loss": 0.0001, + "num_tokens": 12446029.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.372191071510315, + "sampling/importance_sampling_ratio/mean": 0.9995167255401611, + "sampling/importance_sampling_ratio/min": 0.6255383491516113, + "sampling/sampling_logp_difference/max": 0.4691426753997803, + "sampling/sampling_logp_difference/mean": 0.012035621330142021, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 216.15625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.33375227451324463, + "epoch": 0.48651960784313725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02535238968033679, + "kl": 0.011992005631327629, + "learning_rate": 9.886249982302718e-07, + "loss": 0.0001, + "num_tokens": 12477911.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.620891809463501, + "sampling/importance_sampling_ratio/mean": 0.9997732639312744, + "sampling/importance_sampling_ratio/min": 0.6178363561630249, + "sampling/sampling_logp_difference/max": 0.48297643661499023, + "sampling/sampling_logp_difference/mean": 0.014461209997534752, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 235.25, + "completions/mean_terminated_length": 235.25, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3546869158744812, + "epoch": 0.4877450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8350613153136783, + "kl": 0.016289301216602325, + "learning_rate": 9.884734133555585e-07, + "loss": -0.0084, + "num_tokens": 12509975.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.8640707731246948, + "sampling/importance_sampling_ratio/mean": 1.000771164894104, + "sampling/importance_sampling_ratio/min": 0.6371414065361023, + "sampling/sampling_logp_difference/max": 0.6227626800537109, + "sampling/sampling_logp_difference/mean": 0.015145364217460155, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 217.359375, + "completions/mean_terminated_length": 217.359375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3628026247024536, + "epoch": 0.4889705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0230960872634931, + "kl": 0.010546230711042881, + "learning_rate": 9.883208369107617e-07, + "loss": 0.0001, + "num_tokens": 12539774.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4766346216201782, + "sampling/importance_sampling_ratio/mean": 0.9998975396156311, + "sampling/importance_sampling_ratio/min": 0.5483725070953369, + "sampling/sampling_logp_difference/max": 0.6008005142211914, + "sampling/sampling_logp_difference/mean": 0.015969838947057724, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 181.75, + "completions/mean_terminated_length": 181.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3053513169288635, + "epoch": 0.49019607843137253, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040026206678680114, + "kl": 0.01512863114476204, + "learning_rate": 9.88167269205602e-07, + "loss": 0.0001, + "num_tokens": 12564590.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4661747217178345, + "sampling/importance_sampling_ratio/mean": 0.9999059438705444, + "sampling/importance_sampling_ratio/min": 0.6298375129699707, + "sampling/sampling_logp_difference/max": 0.4622933864593506, + "sampling/sampling_logp_difference/mean": 0.01319966372102499, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 231.4375, + "completions/mean_terminated_length": 231.4375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.338267982006073, + "epoch": 0.49142156862745096, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01592680116951382, + "kl": 0.009573228657245636, + "learning_rate": 9.880127105518122e-07, + "loss": 0.0001, + "num_tokens": 12597114.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4357717037200928, + "sampling/importance_sampling_ratio/mean": 0.999531626701355, + "sampling/importance_sampling_ratio/min": 0.45780494809150696, + "sampling/sampling_logp_difference/max": 0.781312108039856, + "sampling/sampling_logp_difference/mean": 0.01465204730629921, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 177.46875, + "completions/mean_terminated_length": 177.46875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.42952173948287964, + "epoch": 0.49264705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0164482068868528, + "kl": 0.01893451064825058, + "learning_rate": 9.878571612631363e-07, + "loss": -0.013, + "num_tokens": 12623736.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5284698009490967, + "sampling/importance_sampling_ratio/mean": 1.0002241134643555, + "sampling/importance_sampling_ratio/min": 0.601417601108551, + "sampling/sampling_logp_difference/max": 0.5084657669067383, + "sampling/sampling_logp_difference/mean": 0.019010312855243683, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 192.96875, + "completions/mean_terminated_length": 192.96875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3650784492492676, + "epoch": 0.49387254901960786, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0365249853735248, + "kl": 0.0104802455753088, + "learning_rate": 9.8770062165533e-07, + "loss": 0.0001, + "num_tokens": 12658470.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5172513723373413, + "sampling/importance_sampling_ratio/mean": 0.999741792678833, + "sampling/importance_sampling_ratio/min": 0.6829821467399597, + "sampling/sampling_logp_difference/max": 0.4169003963470459, + "sampling/sampling_logp_difference/mean": 0.015502199530601501, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 161.78125, + "completions/mean_terminated_length": 161.78125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35916945338249207, + "epoch": 0.4950980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10011804646684393, + "kl": 0.02148270606994629, + "learning_rate": 9.875430920461583e-07, + "loss": 0.0002, + "num_tokens": 12686120.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.454472541809082, + "sampling/importance_sampling_ratio/mean": 0.9993026256561279, + "sampling/importance_sampling_ratio/min": 0.47207364439964294, + "sampling/sampling_logp_difference/max": 0.7506203651428223, + "sampling/sampling_logp_difference/mean": 0.016054194420576096, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 194.75, + "completions/mean_terminated_length": 194.75, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.31307125091552734, + "epoch": 0.4963235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03287256451829378, + "kl": 0.01108582690358162, + "learning_rate": 9.873845727553965e-07, + "loss": 0.0001, + "num_tokens": 12720904.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.479647159576416, + "sampling/importance_sampling_ratio/mean": 0.9998552203178406, + "sampling/importance_sampling_ratio/min": 0.6676144599914551, + "sampling/sampling_logp_difference/max": 0.40404438972473145, + "sampling/sampling_logp_difference/mean": 0.013567068614065647, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 181.296875, + "completions/mean_terminated_length": 181.296875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4192981421947479, + "epoch": 0.49754901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022305365019105922, + "kl": 0.013657962903380394, + "learning_rate": 9.87225064104829e-07, + "loss": 0.0001, + "num_tokens": 12747435.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4031422138214111, + "sampling/importance_sampling_ratio/mean": 0.9997045397758484, + "sampling/importance_sampling_ratio/min": 0.6355483531951904, + "sampling/sampling_logp_difference/max": 0.45326709747314453, + "sampling/sampling_logp_difference/mean": 0.01757827028632164, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 226.03125, + "completions/mean_terminated_length": 226.03125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.30763596296310425, + "epoch": 0.4987745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023753100882734985, + "kl": 0.01026175543665886, + "learning_rate": 9.870645664182476e-07, + "loss": 0.0001, + "num_tokens": 12778301.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5588346719741821, + "sampling/importance_sampling_ratio/mean": 0.9999215602874756, + "sampling/importance_sampling_ratio/min": 0.6088090538978577, + "sampling/sampling_logp_difference/max": 0.49625062942504883, + "sampling/sampling_logp_difference/mean": 0.013904592022299767, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 170.84375, + "completions/mean_terminated_length": 170.84375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3431023955345154, + "epoch": 0.5, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024057986878457543, + "kl": 0.013347751460969448, + "learning_rate": 9.86903080021453e-07, + "loss": 0.0001, + "num_tokens": 12807619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5784333944320679, + "sampling/importance_sampling_ratio/mean": 0.999455988407135, + "sampling/importance_sampling_ratio/min": 0.5910114645957947, + "sampling/sampling_logp_difference/max": 0.5259199142456055, + "sampling/sampling_logp_difference/mean": 0.01554157119244337, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 241.53125, + "completions/mean_terminated_length": 241.53125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.35906076431274414, + "epoch": 0.5012254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014658573627134372, + "kl": 0.009751342236995697, + "learning_rate": 9.867406052422523e-07, + "loss": 0.0001, + "num_tokens": 12845237.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9751145839691162, + "sampling/importance_sampling_ratio/mean": 1.0009266138076782, + "sampling/importance_sampling_ratio/min": 0.6303215622901917, + "sampling/sampling_logp_difference/max": 0.680626392364502, + "sampling/sampling_logp_difference/mean": 0.014839326031506062, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 253.65625, + "completions/mean_terminated_length": 253.65625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.37862318754196167, + "epoch": 0.5024509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7770336075388821, + "kl": 0.011146817356348038, + "learning_rate": 9.865771424104587e-07, + "loss": -0.0127, + "num_tokens": 12878559.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.475413203239441, + "sampling/importance_sampling_ratio/mean": 0.9998399019241333, + "sampling/importance_sampling_ratio/min": 0.6268531680107117, + "sampling/sampling_logp_difference/max": 0.4670429229736328, + "sampling/sampling_logp_difference/mean": 0.014631716534495354, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 251.28125, + "completions/mean_terminated_length": 251.28125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.42295581102371216, + "epoch": 0.5036764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5690485828250321, + "kl": 0.010090619325637817, + "learning_rate": 9.864126918578919e-07, + "loss": -0.0016, + "num_tokens": 12912721.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.2837777137756348, + "sampling/importance_sampling_ratio/mean": 1.000241994857788, + "sampling/importance_sampling_ratio/min": 0.5151903033256531, + "sampling/sampling_logp_difference/max": 0.6632189750671387, + "sampling/sampling_logp_difference/mean": 0.015532649122178555, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 175.734375, + "completions/mean_terminated_length": 175.734375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.41181814670562744, + "epoch": 0.5049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01816848416533495, + "kl": 0.01359694916754961, + "learning_rate": 9.862472539183755e-07, + "loss": 0.0001, + "num_tokens": 12939456.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5215433835983276, + "sampling/importance_sampling_ratio/mean": 1.0003886222839355, + "sampling/importance_sampling_ratio/min": 0.5691208839416504, + "sampling/sampling_logp_difference/max": 0.5636624097824097, + "sampling/sampling_logp_difference/mean": 0.017363913357257843, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3904797434806824, + "epoch": 0.5061274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018690357847311847, + "kl": 0.013147925958037376, + "learning_rate": 9.860808289277385e-07, + "loss": 0.0001, + "num_tokens": 12967208.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4828284978866577, + "sampling/importance_sampling_ratio/mean": 1.0003409385681152, + "sampling/importance_sampling_ratio/min": 0.6065059304237366, + "sampling/sampling_logp_difference/max": 0.5000407695770264, + "sampling/sampling_logp_difference/mean": 0.01624198630452156, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 195.1875, + "completions/mean_terminated_length": 195.1875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.30729755759239197, + "epoch": 0.5073529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0649258163660247, + "kl": 0.010252762585878372, + "learning_rate": 9.859134172238128e-07, + "loss": -0.0148, + "num_tokens": 12994340.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.366351842880249, + "sampling/importance_sampling_ratio/mean": 1.000044345855713, + "sampling/importance_sampling_ratio/min": 0.6018763184547424, + "sampling/sampling_logp_difference/max": 0.5077033042907715, + "sampling/sampling_logp_difference/mean": 0.012773063965141773, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 182.234375, + "completions/mean_terminated_length": 182.234375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.32861125469207764, + "epoch": 0.508578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016148268535324674, + "kl": 0.010346755385398865, + "learning_rate": 9.857450191464337e-07, + "loss": 0.0001, + "num_tokens": 13021955.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.527945876121521, + "sampling/importance_sampling_ratio/mean": 0.9997206926345825, + "sampling/importance_sampling_ratio/min": 0.4597110450267792, + "sampling/sampling_logp_difference/max": 0.777157187461853, + "sampling/sampling_logp_difference/mean": 0.01393515057861805, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 236.140625, + "completions/mean_terminated_length": 236.140625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.437416136264801, + "epoch": 0.5098039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7352635655684305, + "kl": 0.008323276415467262, + "learning_rate": 9.855756350374386e-07, + "loss": 0.0104, + "num_tokens": 13065692.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.7883018255233765, + "sampling/importance_sampling_ratio/mean": 0.9998971819877625, + "sampling/importance_sampling_ratio/min": 0.5383554697036743, + "sampling/sampling_logp_difference/max": 0.6192362308502197, + "sampling/sampling_logp_difference/mean": 0.017036153003573418, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 222.28125, + "completions/mean_terminated_length": 222.28125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.43742769956588745, + "epoch": 0.5110294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9121850816629584, + "kl": 0.010742575861513615, + "learning_rate": 9.854052652406665e-07, + "loss": 0.0112, + "num_tokens": 13100894.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.557647466659546, + "sampling/importance_sampling_ratio/mean": 1.0001201629638672, + "sampling/importance_sampling_ratio/min": 0.5368156433105469, + "sampling/sampling_logp_difference/max": 0.6221005916595459, + "sampling/sampling_logp_difference/mean": 0.01638263650238514, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 194.453125, + "completions/mean_terminated_length": 194.453125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3751828074455261, + "epoch": 0.5122549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0845766431346806, + "kl": 0.01224912516772747, + "learning_rate": 9.852339101019572e-07, + "loss": -0.0416, + "num_tokens": 13130507.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.8485195636749268, + "sampling/importance_sampling_ratio/mean": 0.9996465444564819, + "sampling/importance_sampling_ratio/min": 0.5415288805961609, + "sampling/sampling_logp_difference/max": 0.6143851280212402, + "sampling/sampling_logp_difference/mean": 0.015233149752020836, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 203.484375, + "completions/mean_terminated_length": 203.484375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.45921579003334045, + "epoch": 0.5134803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3310712480782592, + "kl": 0.012076552957296371, + "learning_rate": 9.85061569969151e-07, + "loss": -0.03, + "num_tokens": 13164778.0, + "reward": -0.03125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4747954607009888, + "sampling/importance_sampling_ratio/mean": 0.99909508228302, + "sampling/importance_sampling_ratio/min": 0.6616628766059875, + "sampling/sampling_logp_difference/max": 0.41299915313720703, + "sampling/sampling_logp_difference/mean": 0.016422288492321968, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 208.6875, + "completions/mean_terminated_length": 208.6875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3692478835582733, + "epoch": 0.5147058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02518174999162162, + "kl": 0.01171743031591177, + "learning_rate": 9.848882451920875e-07, + "loss": 0.0001, + "num_tokens": 13196502.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.461438536643982, + "sampling/importance_sampling_ratio/mean": 0.9998584985733032, + "sampling/importance_sampling_ratio/min": 0.6298533082008362, + "sampling/sampling_logp_difference/max": 0.4622683525085449, + "sampling/sampling_logp_difference/mean": 0.014890164136886597, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 166.703125, + "completions/mean_terminated_length": 166.703125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.36562401056289673, + "epoch": 0.5159313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1242874525286544, + "kl": 0.015466064214706421, + "learning_rate": 9.847139361226046e-07, + "loss": 0.0056, + "num_tokens": 13221891.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6145374774932861, + "sampling/importance_sampling_ratio/mean": 1.000512719154358, + "sampling/importance_sampling_ratio/min": 0.6652603149414062, + "sampling/sampling_logp_difference/max": 0.479048490524292, + "sampling/sampling_logp_difference/mean": 0.01558524090796709, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 181.3125, + "completions/mean_terminated_length": 181.3125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.42874494194984436, + "epoch": 0.5171568627450981, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4318856251018424, + "kl": 0.022168340161442757, + "learning_rate": 9.84538643114539e-07, + "loss": -0.0212, + "num_tokens": 13245975.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000335454940796, + "sampling/importance_sampling_ratio/min": 0.6910908818244934, + "sampling/sampling_logp_difference/max": 1.2831158638000488, + "sampling/sampling_logp_difference/mean": 0.01832837238907814, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 179.109375, + "completions/mean_terminated_length": 179.109375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3738434314727783, + "epoch": 0.5183823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1146246273592262, + "kl": 0.01757657900452614, + "learning_rate": 9.843623665237242e-07, + "loss": 0.0145, + "num_tokens": 13276766.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4726742506027222, + "sampling/importance_sampling_ratio/mean": 0.999600887298584, + "sampling/importance_sampling_ratio/min": 0.4992694556713104, + "sampling/sampling_logp_difference/max": 0.6946094036102295, + "sampling/sampling_logp_difference/mean": 0.015279553830623627, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 202.3125, + "completions/mean_terminated_length": 202.3125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.32806265354156494, + "epoch": 0.5196078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021441570679582925, + "kl": 0.01352194044739008, + "learning_rate": 9.841851067079908e-07, + "loss": 0.0001, + "num_tokens": 13306210.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4367003440856934, + "sampling/importance_sampling_ratio/mean": 1.0002729892730713, + "sampling/importance_sampling_ratio/min": 0.5905013084411621, + "sampling/sampling_logp_difference/max": 0.5267834663391113, + "sampling/sampling_logp_difference/mean": 0.014424655586481094, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 214.65625, + "completions/mean_terminated_length": 214.65625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.41232380270957947, + "epoch": 0.5208333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9098461013976387, + "kl": 0.020982306450605392, + "learning_rate": 9.840068640271647e-07, + "loss": 0.0202, + "num_tokens": 13334908.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.2892931699752808, + "sampling/importance_sampling_ratio/mean": 1.00017511844635, + "sampling/importance_sampling_ratio/min": 0.6300297975540161, + "sampling/sampling_logp_difference/max": 0.4619882106781006, + "sampling/sampling_logp_difference/mean": 0.015482231974601746, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 181.015625, + "completions/mean_terminated_length": 181.015625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.29491668939590454, + "epoch": 0.5220588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8415739092045318, + "kl": 0.021540533751249313, + "learning_rate": 9.838276388430675e-07, + "loss": 0.0161, + "num_tokens": 13361277.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5539828538894653, + "sampling/importance_sampling_ratio/mean": 1.0003516674041748, + "sampling/importance_sampling_ratio/min": 0.6896569132804871, + "sampling/sampling_logp_difference/max": 0.44082117080688477, + "sampling/sampling_logp_difference/mean": 0.012835600413382053, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 255.484375, + "completions/mean_terminated_length": 255.484375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.49975988268852234, + "epoch": 0.5232843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.650289150718313, + "kl": 0.020539449527859688, + "learning_rate": 9.836474315195147e-07, + "loss": 0.0224, + "num_tokens": 13399964.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.9864652156829834, + "sampling/importance_sampling_ratio/mean": 1.0003230571746826, + "sampling/importance_sampling_ratio/min": 0.6403182148933411, + "sampling/sampling_logp_difference/max": 0.686356782913208, + "sampling/sampling_logp_difference/mean": 0.016608383506536484, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 193.84375, + "completions/mean_terminated_length": 193.84375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.36028361320495605, + "epoch": 0.5245098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9515160665297485, + "kl": 0.03193719685077667, + "learning_rate": 9.83466242422316e-07, + "loss": -0.0123, + "num_tokens": 13427074.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4429755210876465, + "sampling/importance_sampling_ratio/mean": 0.9999572038650513, + "sampling/importance_sampling_ratio/min": 0.5236291289329529, + "sampling/sampling_logp_difference/max": 0.6469717025756836, + "sampling/sampling_logp_difference/mean": 0.01483781449496746, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 257.140625, + "completions/mean_terminated_length": 257.140625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3348478078842163, + "epoch": 0.5257352941176471, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7325311645335302, + "kl": 0.01973658800125122, + "learning_rate": 9.832840719192735e-07, + "loss": -0.0226, + "num_tokens": 13462139.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.477462887763977, + "sampling/importance_sampling_ratio/mean": 0.9998133182525635, + "sampling/importance_sampling_ratio/min": 0.6276203393936157, + "sampling/sampling_logp_difference/max": 0.4658198356628418, + "sampling/sampling_logp_difference/mean": 0.012661002576351166, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 236.0625, + "completions/mean_terminated_length": 236.0625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4094201326370239, + "epoch": 0.5269607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8411620741604495, + "kl": 0.0227787084877491, + "learning_rate": 9.831009203801822e-07, + "loss": -0.0028, + "num_tokens": 13497567.0, + "reward": -0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6725372076034546, + "sampling/importance_sampling_ratio/mean": 1.00007963180542, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.5143417119979858, + "sampling/sampling_logp_difference/mean": 0.013968531042337418, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 283.78125, + "completions/mean_terminated_length": 283.78125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.35234811902046204, + "epoch": 0.5281862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7081715264001555, + "kl": 0.02388506382703781, + "learning_rate": 9.829167881768277e-07, + "loss": 0.0058, + "num_tokens": 13536129.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.542195439338684, + "sampling/importance_sampling_ratio/mean": 1.0000916719436646, + "sampling/importance_sampling_ratio/min": 0.6269434690475464, + "sampling/sampling_logp_difference/max": 0.46689891815185547, + "sampling/sampling_logp_difference/mean": 0.012889344245195389, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 282.828125, + "completions/mean_terminated_length": 282.828125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.49479252099990845, + "epoch": 0.5294117647058824, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.057304346159149, + "kl": 0.020965974777936935, + "learning_rate": 9.82731675682987e-07, + "loss": 0.0221, + "num_tokens": 13574166.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.3233028650283813, + "sampling/importance_sampling_ratio/mean": 0.9997720122337341, + "sampling/importance_sampling_ratio/min": 0.6226791739463806, + "sampling/sampling_logp_difference/max": 0.4737238883972168, + "sampling/sampling_logp_difference/mean": 0.01630120724439621, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 262.046875, + "completions/mean_terminated_length": 262.046875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.49494457244873047, + "epoch": 0.5306372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9025425524831447, + "kl": 0.030203722417354584, + "learning_rate": 9.825455832744266e-07, + "loss": -0.0046, + "num_tokens": 13611705.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5823229551315308, + "sampling/importance_sampling_ratio/mean": 0.999476432800293, + "sampling/importance_sampling_ratio/min": 0.614302396774292, + "sampling/sampling_logp_difference/max": 0.48726797103881836, + "sampling/sampling_logp_difference/mean": 0.015790484845638275, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 330.78125, + "completions/mean_terminated_length": 330.78125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.48657986521720886, + "epoch": 0.5318627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6149870979271623, + "kl": 0.02005343697965145, + "learning_rate": 9.823585113289023e-07, + "loss": 0.0058, + "num_tokens": 13661579.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.2989569902420044, + "sampling/importance_sampling_ratio/mean": 0.999963641166687, + "sampling/importance_sampling_ratio/min": 0.6136351227760315, + "sampling/sampling_logp_difference/max": 0.48835480213165283, + "sampling/sampling_logp_difference/mean": 0.015401165932416916, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 294.6875, + "completions/mean_terminated_length": 294.6875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.4754374921321869, + "epoch": 0.5330882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.76943731528402, + "kl": 0.02414841577410698, + "learning_rate": 9.821704602261585e-07, + "loss": 0.023, + "num_tokens": 13702807.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4890077114105225, + "sampling/importance_sampling_ratio/mean": 0.9997965693473816, + "sampling/importance_sampling_ratio/min": 0.6805422902107239, + "sampling/sampling_logp_difference/max": 0.39810991287231445, + "sampling/sampling_logp_difference/mean": 0.015343626961112022, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 240.75, + "completions/mean_terminated_length": 240.75, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3381650447845459, + "epoch": 0.5343137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03057740845848974, + "kl": 0.02493605762720108, + "learning_rate": 9.819814303479267e-07, + "loss": 0.0002, + "num_tokens": 13734935.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4994944334030151, + "sampling/importance_sampling_ratio/mean": 0.9998176097869873, + "sampling/importance_sampling_ratio/min": 0.6151838302612305, + "sampling/sampling_logp_difference/max": 0.48583412170410156, + "sampling/sampling_logp_difference/mean": 0.012891847640275955, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 250.5625, + "completions/mean_terminated_length": 250.5625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.4581558108329773, + "epoch": 0.5355392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03482387842695008, + "kl": 0.03028777986764908, + "learning_rate": 9.817914220779256e-07, + "loss": 0.0003, + "num_tokens": 13769003.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.389121651649475, + "sampling/importance_sampling_ratio/mean": 0.9999805688858032, + "sampling/importance_sampling_ratio/min": 0.7036060690879822, + "sampling/sampling_logp_difference/max": 0.3515366315841675, + "sampling/sampling_logp_difference/mean": 0.015824012458324432, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 202.328125, + "completions/mean_terminated_length": 202.328125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4181891977787018, + "epoch": 0.5367647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0362809519320139, + "kl": 0.03653764724731445, + "learning_rate": 9.816004358018603e-07, + "loss": 0.0003, + "num_tokens": 13799360.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4336248636245728, + "sampling/importance_sampling_ratio/mean": 1.0002336502075195, + "sampling/importance_sampling_ratio/min": 0.6557809710502625, + "sampling/sampling_logp_difference/max": 0.42192840576171875, + "sampling/sampling_logp_difference/mean": 0.015442028641700745, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 202.75, + "completions/mean_terminated_length": 202.75, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.35558366775512695, + "epoch": 0.5379901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030189696554764472, + "kl": 0.03488951548933983, + "learning_rate": 9.814084719074204e-07, + "loss": 0.0003, + "num_tokens": 13828560.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.621432900428772, + "sampling/importance_sampling_ratio/mean": 0.9999073147773743, + "sampling/importance_sampling_ratio/min": 0.6702842116355896, + "sampling/sampling_logp_difference/max": 0.4833102226257324, + "sampling/sampling_logp_difference/mean": 0.015155116096138954, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 264.4375, + "completions/mean_terminated_length": 264.4375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.43071991205215454, + "epoch": 0.5392156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.591754916786323, + "kl": 0.034047938883304596, + "learning_rate": 9.81215530784281e-07, + "loss": -0.007, + "num_tokens": 13861340.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6207441091537476, + "sampling/importance_sampling_ratio/mean": 1.0001143217086792, + "sampling/importance_sampling_ratio/min": 0.6139618158340454, + "sampling/sampling_logp_difference/max": 0.4878225326538086, + "sampling/sampling_logp_difference/mean": 0.014538413845002651, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 180.078125, + "completions/mean_terminated_length": 180.078125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.36084920167922974, + "epoch": 0.5404411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1432552175845352, + "kl": 0.038981202989816666, + "learning_rate": 9.810216128240996e-07, + "loss": 0.0332, + "num_tokens": 13887153.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5471285581588745, + "sampling/importance_sampling_ratio/mean": 1.0002764463424683, + "sampling/importance_sampling_ratio/min": 0.6482198238372803, + "sampling/sampling_logp_difference/max": 0.4364006519317627, + "sampling/sampling_logp_difference/mean": 0.015175689943134785, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 244.5625, + "completions/mean_terminated_length": 244.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.37251731753349304, + "epoch": 0.5416666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.635142205905668, + "kl": 0.02709903195500374, + "learning_rate": 9.808267184205181e-07, + "loss": 0.0009, + "num_tokens": 13924437.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5991549491882324, + "sampling/importance_sampling_ratio/mean": 1.0000724792480469, + "sampling/importance_sampling_ratio/min": 0.3642270565032959, + "sampling/sampling_logp_difference/max": 1.0099778175354004, + "sampling/sampling_logp_difference/mean": 0.01269014272838831, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 287.84375, + "completions/mean_terminated_length": 287.84375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.5027158260345459, + "epoch": 0.5428921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9928813317044869, + "kl": 0.029346946626901627, + "learning_rate": 9.806308479691594e-07, + "loss": -0.0264, + "num_tokens": 13964091.0, + "reward": -0.1875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.3728333711624146, + "sampling/importance_sampling_ratio/mean": 0.9996798038482666, + "sampling/importance_sampling_ratio/min": 0.45287877321243286, + "sampling/sampling_logp_difference/max": 0.7921308279037476, + "sampling/sampling_logp_difference/mean": 0.015696164220571518, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 228.546875, + "completions/mean_terminated_length": 228.546875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4584943652153015, + "epoch": 0.5441176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7849590758617065, + "kl": 0.02762674354016781, + "learning_rate": 9.80434001867628e-07, + "loss": -0.0096, + "num_tokens": 14003614.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5744317770004272, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.6228858232498169, + "sampling/sampling_logp_difference/max": 0.4733920097351074, + "sampling/sampling_logp_difference/mean": 0.01629599556326866, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 202.859375, + "completions/mean_terminated_length": 202.859375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3557378947734833, + "epoch": 0.5453431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031780399021234605, + "kl": 0.02672041766345501, + "learning_rate": 9.802361805155097e-07, + "loss": 0.0003, + "num_tokens": 14029573.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5874168872833252, + "sampling/importance_sampling_ratio/mean": 0.9998052716255188, + "sampling/importance_sampling_ratio/min": 0.6247994899749756, + "sampling/sampling_logp_difference/max": 0.4703245162963867, + "sampling/sampling_logp_difference/mean": 0.015231041237711906, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 297.71875, + "completions/mean_terminated_length": 297.71875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.4070301055908203, + "epoch": 0.5465686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022448326964289424, + "kl": 0.022481031715869904, + "learning_rate": 9.800373843143683e-07, + "loss": 0.0002, + "num_tokens": 14076371.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6207866668701172, + "sampling/importance_sampling_ratio/mean": 1.0000665187835693, + "sampling/importance_sampling_ratio/min": 0.7068935632705688, + "sampling/sampling_logp_difference/max": 0.4829115867614746, + "sampling/sampling_logp_difference/mean": 0.014261187054216862, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 208.515625, + "completions/mean_terminated_length": 208.515625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4191475212574005, + "epoch": 0.5477941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02705528768846311, + "kl": 0.02209075167775154, + "learning_rate": 9.798376136677484e-07, + "loss": 0.0002, + "num_tokens": 14106676.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3644667863845825, + "sampling/importance_sampling_ratio/mean": 0.9999842643737793, + "sampling/importance_sampling_ratio/min": 0.5770688056945801, + "sampling/sampling_logp_difference/max": 0.5497937202453613, + "sampling/sampling_logp_difference/mean": 0.015202060341835022, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 215.125, + "completions/mean_terminated_length": 215.125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.38473254442214966, + "epoch": 0.5490196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024562931390235142, + "kl": 0.022687768563628197, + "learning_rate": 9.796368689811712e-07, + "loss": 0.0002, + "num_tokens": 14138428.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8606518507003784, + "sampling/importance_sampling_ratio/mean": 0.999741792678833, + "sampling/importance_sampling_ratio/min": 0.6147654056549072, + "sampling/sampling_logp_difference/max": 0.6209268569946289, + "sampling/sampling_logp_difference/mean": 0.0159025676548481, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 190.21875, + "completions/mean_terminated_length": 190.21875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.37754613161087036, + "epoch": 0.5502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.032390692048454, + "kl": 0.0324709452688694, + "learning_rate": 9.79435150662136e-07, + "loss": 0.0366, + "num_tokens": 14163786.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.614363431930542, + "sampling/importance_sampling_ratio/mean": 1.0000146627426147, + "sampling/importance_sampling_ratio/min": 0.6300994753837585, + "sampling/sampling_logp_difference/max": 0.4789407253265381, + "sampling/sampling_logp_difference/mean": 0.015572885051369667, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 253.796875, + "completions/mean_terminated_length": 253.796875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.3987857699394226, + "epoch": 0.5514705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018155648464634834, + "kl": 0.020753910765051842, + "learning_rate": 9.792324591201177e-07, + "loss": 0.0002, + "num_tokens": 14203613.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.42256498336792, + "sampling/importance_sampling_ratio/mean": 1.0002810955047607, + "sampling/importance_sampling_ratio/min": 0.6784428358078003, + "sampling/sampling_logp_difference/max": 0.38795506954193115, + "sampling/sampling_logp_difference/mean": 0.014544595032930374, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 227.046875, + "completions/mean_terminated_length": 227.046875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.42339175939559937, + "epoch": 0.5526960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0838351412506786, + "kl": 0.02783692628145218, + "learning_rate": 9.790287947665681e-07, + "loss": -0.0175, + "num_tokens": 14240032.0, + "reward": -0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.8147363662719727, + "sampling/importance_sampling_ratio/mean": 1.0001912117004395, + "sampling/importance_sampling_ratio/min": 0.5722141861915588, + "sampling/sampling_logp_difference/max": 0.5959402322769165, + "sampling/sampling_logp_difference/mean": 0.016225896775722504, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 217.296875, + "completions/mean_terminated_length": 217.296875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3951900899410248, + "epoch": 0.553921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2519750170992627, + "kl": 0.022853782400488853, + "learning_rate": 9.788241580149122e-07, + "loss": 0.0703, + "num_tokens": 14276403.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004253387451172, + "sampling/importance_sampling_ratio/min": 0.3980225920677185, + "sampling/sampling_logp_difference/max": 0.9212465286254883, + "sampling/sampling_logp_difference/mean": 0.014143247157335281, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 189.140625, + "completions/mean_terminated_length": 189.140625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3912501335144043, + "epoch": 0.5551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9601244698291272, + "kl": 0.02590246871113777, + "learning_rate": 9.786185492805501e-07, + "loss": -0.0114, + "num_tokens": 14303964.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5964021682739258, + "sampling/importance_sampling_ratio/mean": 1.0004160404205322, + "sampling/importance_sampling_ratio/min": 0.7093971967697144, + "sampling/sampling_logp_difference/max": 0.46775245666503906, + "sampling/sampling_logp_difference/mean": 0.015068481676280499, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 182.421875, + "completions/mean_terminated_length": 182.421875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.37030190229415894, + "epoch": 0.5563725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1338304084887485, + "kl": 0.028160633519291878, + "learning_rate": 9.784119689808542e-07, + "loss": 0.0073, + "num_tokens": 14335335.0, + "reward": 0.53125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4796181917190552, + "sampling/importance_sampling_ratio/mean": 1.0000211000442505, + "sampling/importance_sampling_ratio/min": 0.6792578101158142, + "sampling/sampling_logp_difference/max": 0.39178407192230225, + "sampling/sampling_logp_difference/mean": 0.015386087819933891, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 177.765625, + "completions/mean_terminated_length": 177.765625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.34240347146987915, + "epoch": 0.5575980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02345553308197358, + "kl": 0.028486458584666252, + "learning_rate": 9.782044175351699e-07, + "loss": 0.0003, + "num_tokens": 14366856.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.42637038230896, + "sampling/importance_sampling_ratio/mean": 0.9996859431266785, + "sampling/importance_sampling_ratio/min": 0.6485440731048584, + "sampling/sampling_logp_difference/max": 0.4330253601074219, + "sampling/sampling_logp_difference/mean": 0.015499918721616268, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 190.71875, + "completions/mean_terminated_length": 190.71875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.47459185123443604, + "epoch": 0.5588235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0209203187628284, + "kl": 0.031104128807783127, + "learning_rate": 9.779958953648129e-07, + "loss": 0.0363, + "num_tokens": 14398294.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.3172099590301514, + "sampling/importance_sampling_ratio/mean": 0.999106764793396, + "sampling/importance_sampling_ratio/min": 0.6602303981781006, + "sampling/sampling_logp_difference/max": 0.415166437625885, + "sampling/sampling_logp_difference/mean": 0.016539257019758224, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 174.828125, + "completions/mean_terminated_length": 174.828125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.42182838916778564, + "epoch": 0.5600490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.021459722073482, + "kl": 0.02934981882572174, + "learning_rate": 9.777864028930705e-07, + "loss": -0.0026, + "num_tokens": 14425291.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.483134150505066, + "sampling/importance_sampling_ratio/mean": 0.9997996687889099, + "sampling/importance_sampling_ratio/min": 0.6134141683578491, + "sampling/sampling_logp_difference/max": 0.48871493339538574, + "sampling/sampling_logp_difference/mean": 0.017257895320653915, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 164.28125, + "completions/mean_terminated_length": 164.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.2947123050689697, + "epoch": 0.5612745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9367905375406443, + "kl": 0.032778844237327576, + "learning_rate": 9.775759405451986e-07, + "loss": -0.0105, + "num_tokens": 14456493.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4753822088241577, + "sampling/importance_sampling_ratio/mean": 1.00032639503479, + "sampling/importance_sampling_ratio/min": 0.7047656178474426, + "sampling/sampling_logp_difference/max": 0.38891708850860596, + "sampling/sampling_logp_difference/mean": 0.01345054805278778, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.417305052280426, + "epoch": 0.5625, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0784437365504824, + "kl": 0.036820050328969955, + "learning_rate": 9.773645087484228e-07, + "loss": 0.0099, + "num_tokens": 14484567.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.380178689956665, + "sampling/importance_sampling_ratio/mean": 0.9998287558555603, + "sampling/importance_sampling_ratio/min": 0.6753222942352295, + "sampling/sampling_logp_difference/max": 0.3925652503967285, + "sampling/sampling_logp_difference/mean": 0.017081189900636673, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 172.875, + "completions/mean_terminated_length": 172.875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.36164188385009766, + "epoch": 0.5637254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4720842033761148, + "kl": 0.03700194135308266, + "learning_rate": 9.771521079319363e-07, + "loss": 0.0037, + "num_tokens": 14516623.0, + "reward": 0.84375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.571526050567627, + "sampling/importance_sampling_ratio/mean": 1.0001332759857178, + "sampling/importance_sampling_ratio/min": 0.4859488904476166, + "sampling/sampling_logp_difference/max": 0.7216517925262451, + "sampling/sampling_logp_difference/mean": 0.014842424541711807, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 179.25, + "completions/mean_terminated_length": 179.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.40478748083114624, + "epoch": 0.5649509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9782248174795586, + "kl": 0.03830481320619583, + "learning_rate": 9.76938738526899e-07, + "loss": -0.0054, + "num_tokens": 14549359.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6708135604858398, + "sampling/importance_sampling_ratio/mean": 1.000866174697876, + "sampling/importance_sampling_ratio/min": 0.5204440951347351, + "sampling/sampling_logp_difference/max": 0.6530728340148926, + "sampling/sampling_logp_difference/mean": 0.017827894538640976, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 174.765625, + "completions/mean_terminated_length": 174.765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3068470358848572, + "epoch": 0.5661764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031222083588003047, + "kl": 0.028238721191883087, + "learning_rate": 9.767244009664376e-07, + "loss": 0.0003, + "num_tokens": 14583200.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4348357915878296, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.6481047868728638, + "sampling/sampling_logp_difference/max": 0.4337029457092285, + "sampling/sampling_logp_difference/mean": 0.012765954248607159, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 188.0625, + "completions/mean_terminated_length": 188.0625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.40620073676109314, + "epoch": 0.5674019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7900803798163197, + "kl": 0.03524962067604065, + "learning_rate": 9.765090956856435e-07, + "loss": -0.0037, + "num_tokens": 14613636.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4808640480041504, + "sampling/importance_sampling_ratio/mean": 0.9999431371688843, + "sampling/importance_sampling_ratio/min": 0.6116536855697632, + "sampling/sampling_logp_difference/max": 0.4915890693664551, + "sampling/sampling_logp_difference/mean": 0.01594621129333973, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 204.609375, + "completions/mean_terminated_length": 204.609375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.4412503242492676, + "epoch": 0.5686274509803921, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.466674119828152, + "kl": 0.043802544474601746, + "learning_rate": 9.76292823121573e-07, + "loss": 0.0358, + "num_tokens": 14647179.0, + "reward": 0.71875, + "reward_std": 0.565913200378418, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.9443401098251343, + "sampling/importance_sampling_ratio/mean": 1.0002394914627075, + "sampling/importance_sampling_ratio/min": 0.6217594146728516, + "sampling/sampling_logp_difference/max": 0.6649227142333984, + "sampling/sampling_logp_difference/mean": 0.01716477796435356, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 216.15625, + "completions/mean_terminated_length": 216.15625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4255213439464569, + "epoch": 0.5698529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.138903439348918, + "kl": 0.03624618798494339, + "learning_rate": 9.760755837132457e-07, + "loss": 0.024, + "num_tokens": 14686613.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4094603061676025, + "sampling/importance_sampling_ratio/mean": 1.000252604484558, + "sampling/importance_sampling_ratio/min": 0.3503970205783844, + "sampling/sampling_logp_difference/max": 1.0486884117126465, + "sampling/sampling_logp_difference/mean": 0.015629207715392113, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 131.125, + "completions/mean_terminated_length": 131.125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.35965681076049805, + "epoch": 0.571078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1053258217089428, + "kl": 0.04981109872460365, + "learning_rate": 9.758573779016436e-07, + "loss": 0.0045, + "num_tokens": 14707037.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.3779658079147339, + "sampling/importance_sampling_ratio/mean": 0.9993205666542053, + "sampling/importance_sampling_ratio/min": 0.6660267114639282, + "sampling/sampling_logp_difference/max": 0.40642547607421875, + "sampling/sampling_logp_difference/mean": 0.01673746109008789, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 173.90625, + "completions/mean_terminated_length": 173.90625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.36547723412513733, + "epoch": 0.5723039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0466216720434813, + "kl": 0.0352628119289875, + "learning_rate": 9.75638206129711e-07, + "loss": -0.0227, + "num_tokens": 14733463.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.622634768486023, + "sampling/importance_sampling_ratio/mean": 1.0002810955047607, + "sampling/importance_sampling_ratio/min": 0.7660991549491882, + "sampling/sampling_logp_difference/max": 0.4840512275695801, + "sampling/sampling_logp_difference/mean": 0.015148546546697617, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 178.28125, + "completions/mean_terminated_length": 178.28125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.5210790634155273, + "epoch": 0.5735294117647058, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2991492167785403, + "kl": 0.04535311460494995, + "learning_rate": 9.754180688423524e-07, + "loss": -0.0156, + "num_tokens": 14763913.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4715548753738403, + "sampling/importance_sampling_ratio/mean": 1.000481128692627, + "sampling/importance_sampling_ratio/min": 0.6172164082527161, + "sampling/sampling_logp_difference/max": 0.48253560066223145, + "sampling/sampling_logp_difference/mean": 0.01772424206137657, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 182.046875, + "completions/mean_terminated_length": 182.046875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.46314379572868347, + "epoch": 0.5747549019607843, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.9744322865687498, + "kl": 0.05717692896723747, + "learning_rate": 9.751969664864326e-07, + "loss": -0.0096, + "num_tokens": 14794892.0, + "reward": 0.46875, + "reward_std": 0.5281128883361816, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5310888290405273, + "sampling/importance_sampling_ratio/mean": 1.0001671314239502, + "sampling/importance_sampling_ratio/min": 0.6671816110610962, + "sampling/sampling_logp_difference/max": 0.4259791374206543, + "sampling/sampling_logp_difference/mean": 0.016762029379606247, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 189.65625, + "completions/mean_terminated_length": 189.65625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.48477575182914734, + "epoch": 0.5759803921568627, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3977615783156871, + "kl": 0.04280099272727966, + "learning_rate": 9.749748995107756e-07, + "loss": 0.0007, + "num_tokens": 14824502.0, + "reward": 0.46875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997283816337585, + "sampling/importance_sampling_ratio/min": 0.687197208404541, + "sampling/sampling_logp_difference/max": 0.7038180828094482, + "sampling/sampling_logp_difference/mean": 0.016724707558751106, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 183.1875, + "completions/mean_terminated_length": 183.1875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3991392254829407, + "epoch": 0.5772058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032043962557941105, + "kl": 0.039326563477516174, + "learning_rate": 9.74751868366163e-07, + "loss": 0.0004, + "num_tokens": 14854210.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3368841409683228, + "sampling/importance_sampling_ratio/mean": 1.0000734329223633, + "sampling/importance_sampling_ratio/min": 0.6670835614204407, + "sampling/sampling_logp_difference/max": 0.40483999252319336, + "sampling/sampling_logp_difference/mean": 0.013921466656029224, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 229.921875, + "completions/mean_terminated_length": 229.921875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.48380720615386963, + "epoch": 0.5784313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7879687250330961, + "kl": 0.03630056232213974, + "learning_rate": 9.745278735053343e-07, + "loss": 0.0047, + "num_tokens": 14897197.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.83116614818573, + "sampling/importance_sampling_ratio/mean": 1.0003652572631836, + "sampling/importance_sampling_ratio/min": 0.3702128529548645, + "sampling/sampling_logp_difference/max": 0.9936771392822266, + "sampling/sampling_logp_difference/mean": 0.016720673069357872, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 181.9375, + "completions/mean_terminated_length": 181.9375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.40632766485214233, + "epoch": 0.5796568627450981, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5074305833058312, + "kl": 0.04674538969993591, + "learning_rate": 9.743029153829845e-07, + "loss": -0.0353, + "num_tokens": 14928265.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.527808427810669, + "sampling/importance_sampling_ratio/mean": 0.9999127984046936, + "sampling/importance_sampling_ratio/min": 0.6771559119224548, + "sampling/sampling_logp_difference/max": 0.42383432388305664, + "sampling/sampling_logp_difference/mean": 0.015889152884483337, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 165.328125, + "completions/mean_terminated_length": 165.328125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.4638434648513794, + "epoch": 0.5808823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0500186038726147, + "kl": 0.039422713220119476, + "learning_rate": 9.740769944557644e-07, + "loss": 0.0184, + "num_tokens": 14962702.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6118754148483276, + "sampling/importance_sampling_ratio/mean": 0.9996562600135803, + "sampling/importance_sampling_ratio/min": 0.6100545525550842, + "sampling/sampling_logp_difference/max": 0.49420690536499023, + "sampling/sampling_logp_difference/mean": 0.01637447066605091, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 195.546875, + "completions/mean_terminated_length": 195.546875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.41811755299568176, + "epoch": 0.5821078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8360398958246319, + "kl": 0.039523884654045105, + "learning_rate": 9.738501111822792e-07, + "loss": -0.0022, + "num_tokens": 14997889.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.536515712738037, + "sampling/importance_sampling_ratio/mean": 0.999782383441925, + "sampling/importance_sampling_ratio/min": 0.5495277643203735, + "sampling/sampling_logp_difference/max": 0.5986959934234619, + "sampling/sampling_logp_difference/mean": 0.015316365286707878, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 149.59375, + "completions/mean_terminated_length": 149.59375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.29053980112075806, + "epoch": 0.5833333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03875701398066368, + "kl": 0.0342644602060318, + "learning_rate": 9.736222660230878e-07, + "loss": 0.0003, + "num_tokens": 15029351.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6604468822479248, + "sampling/importance_sampling_ratio/mean": 1.0005455017089844, + "sampling/importance_sampling_ratio/min": 0.5771161913871765, + "sampling/sampling_logp_difference/max": 0.5497117042541504, + "sampling/sampling_logp_difference/mean": 0.014183287508785725, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 200.4375, + "completions/mean_terminated_length": 200.4375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4234068989753723, + "epoch": 0.5845588235294118, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8786220431503255, + "kl": 0.03679296374320984, + "learning_rate": 9.73393459440701e-07, + "loss": -0.0214, + "num_tokens": 15062067.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.574912667274475, + "sampling/importance_sampling_ratio/mean": 1.0002837181091309, + "sampling/importance_sampling_ratio/min": 0.6802206039428711, + "sampling/sampling_logp_difference/max": 0.45419979095458984, + "sampling/sampling_logp_difference/mean": 0.017016585916280746, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 200.90625, + "completions/mean_terminated_length": 200.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4000805914402008, + "epoch": 0.5857843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032605977714361904, + "kl": 0.03804188221693039, + "learning_rate": 9.73163691899582e-07, + "loss": 0.0004, + "num_tokens": 15095037.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4352234601974487, + "sampling/importance_sampling_ratio/mean": 0.9999063014984131, + "sampling/importance_sampling_ratio/min": 0.652175784111023, + "sampling/sampling_logp_difference/max": 0.4274411201477051, + "sampling/sampling_logp_difference/mean": 0.015267307870090008, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 130.78125, + "completions/mean_terminated_length": 130.78125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3270447254180908, + "epoch": 0.5870098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0818031132066426, + "kl": 0.055767759680747986, + "learning_rate": 9.729329638661444e-07, + "loss": 0.0055, + "num_tokens": 15121471.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3264274597167969, + "sampling/importance_sampling_ratio/mean": 0.9994399547576904, + "sampling/importance_sampling_ratio/min": 0.6381850838661194, + "sampling/sampling_logp_difference/max": 0.4491269588470459, + "sampling/sampling_logp_difference/mean": 0.015719007700681686, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 231.59375, + "completions/mean_terminated_length": 231.59375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3547309637069702, + "epoch": 0.5882352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04431702110735289, + "kl": 0.039026688784360886, + "learning_rate": 9.727012758087512e-07, + "loss": 0.0004, + "num_tokens": 15155941.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.507184624671936, + "sampling/importance_sampling_ratio/mean": 1.0004100799560547, + "sampling/importance_sampling_ratio/min": 0.6588627099990845, + "sampling/sampling_logp_difference/max": 0.4172401428222656, + "sampling/sampling_logp_difference/mean": 0.013316002674400806, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 219.0, + "completions/mean_terminated_length": 219.0, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.39464816451072693, + "epoch": 0.5894607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028339915855737342, + "kl": 0.04644651710987091, + "learning_rate": 9.724686281977146e-07, + "loss": 0.0004, + "num_tokens": 15190197.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.36279296875, + "sampling/importance_sampling_ratio/mean": 0.9995285272598267, + "sampling/importance_sampling_ratio/min": 0.6821646690368652, + "sampling/sampling_logp_difference/max": 0.38248419761657715, + "sampling/sampling_logp_difference/mean": 0.014964552596211433, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 188.875, + "completions/mean_terminated_length": 188.875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.43031764030456543, + "epoch": 0.5906862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1476481998532106, + "kl": 0.053885385394096375, + "learning_rate": 9.722350215052946e-07, + "loss": 0.0313, + "num_tokens": 15224717.0, + "reward": 0.3125, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.634546160697937, + "sampling/importance_sampling_ratio/mean": 1.0000656843185425, + "sampling/importance_sampling_ratio/min": 0.6249037981033325, + "sampling/sampling_logp_difference/max": 0.4913651943206787, + "sampling/sampling_logp_difference/mean": 0.016320811584591866, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 173.953125, + "completions/mean_terminated_length": 173.953125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3014456629753113, + "epoch": 0.5919117647058824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03749685995290343, + "kl": 0.04681577906012535, + "learning_rate": 9.720004562056979e-07, + "loss": 0.0004, + "num_tokens": 15253738.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.418946623802185, + "sampling/importance_sampling_ratio/mean": 1.0003348588943481, + "sampling/importance_sampling_ratio/min": 0.6374188661575317, + "sampling/sampling_logp_difference/max": 0.45032835006713867, + "sampling/sampling_logp_difference/mean": 0.013938084244728088, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 187.359375, + "completions/mean_terminated_length": 187.359375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.37069573998451233, + "epoch": 0.5931372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07186512216651662, + "kl": 0.04863576591014862, + "learning_rate": 9.717649327750773e-07, + "loss": 0.0005, + "num_tokens": 15284929.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6029527187347412, + "sampling/importance_sampling_ratio/mean": 1.0000958442687988, + "sampling/importance_sampling_ratio/min": 0.6163931488990784, + "sampling/sampling_logp_difference/max": 0.483870267868042, + "sampling/sampling_logp_difference/mean": 0.01496157981455326, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.31721800565719604, + "epoch": 0.5943627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025045108273247088, + "kl": 0.03179796040058136, + "learning_rate": 9.7152845169153e-07, + "loss": 0.0003, + "num_tokens": 15314219.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3244808912277222, + "sampling/importance_sampling_ratio/mean": 0.9997227191925049, + "sampling/importance_sampling_ratio/min": 0.6771866083145142, + "sampling/sampling_logp_difference/max": 0.38980841636657715, + "sampling/sampling_logp_difference/mean": 0.013248606584966183, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 226.078125, + "completions/mean_terminated_length": 226.078125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.4020075798034668, + "epoch": 0.5955882352941176, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.865404599052011, + "kl": 0.03322160243988037, + "learning_rate": 9.712910134350984e-07, + "loss": -0.0068, + "num_tokens": 15346192.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5245110988616943, + "sampling/importance_sampling_ratio/mean": 0.9994904398918152, + "sampling/importance_sampling_ratio/min": 0.6649379134178162, + "sampling/sampling_logp_difference/max": 0.4216737747192383, + "sampling/sampling_logp_difference/mean": 0.01634867489337921, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 165.921875, + "completions/mean_terminated_length": 165.921875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.32319891452789307, + "epoch": 0.5968137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05301787512221058, + "kl": 0.044561855494976044, + "learning_rate": 9.710526184877666e-07, + "loss": 0.0004, + "num_tokens": 15371083.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5076807737350464, + "sampling/importance_sampling_ratio/mean": 0.9999871253967285, + "sampling/importance_sampling_ratio/min": 0.6214069724082947, + "sampling/sampling_logp_difference/max": 0.47576904296875, + "sampling/sampling_logp_difference/mean": 0.015059342607855797, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3339904248714447, + "epoch": 0.5980392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9118170726402404, + "kl": 0.049351539462804794, + "learning_rate": 9.708132673334615e-07, + "loss": 0.0067, + "num_tokens": 15397555.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3671423196792603, + "sampling/importance_sampling_ratio/mean": 1.0000923871994019, + "sampling/importance_sampling_ratio/min": 0.5302200317382812, + "sampling/sampling_logp_difference/max": 0.6344633102416992, + "sampling/sampling_logp_difference/mean": 0.0142977274954319, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 217.171875, + "completions/mean_terminated_length": 217.171875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.369615375995636, + "epoch": 0.5992647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8277067553657047, + "kl": 0.032596759498119354, + "learning_rate": 9.705729604580505e-07, + "loss": -0.0057, + "num_tokens": 15428302.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5348131656646729, + "sampling/importance_sampling_ratio/mean": 0.9999060034751892, + "sampling/importance_sampling_ratio/min": 0.4914447069168091, + "sampling/sampling_logp_difference/max": 0.7104058265686035, + "sampling/sampling_logp_difference/mean": 0.014864052645862103, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 197.1875, + "completions/mean_terminated_length": 197.1875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.30024808645248413, + "epoch": 0.6004901960784313, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.791695814136919, + "kl": 0.04219331592321396, + "learning_rate": 9.703316983493412e-07, + "loss": -0.0198, + "num_tokens": 15457626.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6603162288665771, + "sampling/importance_sampling_ratio/mean": 1.0006078481674194, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.5070080757141113, + "sampling/sampling_logp_difference/mean": 0.013717526569962502, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 218.0, + "completions/mean_terminated_length": 218.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3099389374256134, + "epoch": 0.6017156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.022027601431333, + "kl": 0.04899393394589424, + "learning_rate": 9.700894814970808e-07, + "loss": 0.0799, + "num_tokens": 15486074.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5108519792556763, + "sampling/importance_sampling_ratio/mean": 1.0001444816589355, + "sampling/importance_sampling_ratio/min": 0.7161623239517212, + "sampling/sampling_logp_difference/max": 0.4126737117767334, + "sampling/sampling_logp_difference/mean": 0.01254919171333313, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 192.21875, + "completions/mean_terminated_length": 192.21875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.32838335633277893, + "epoch": 0.6029411764705882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027043875500573203, + "kl": 0.03538636118173599, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0004, + "num_tokens": 15514776.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3303242921829224, + "sampling/importance_sampling_ratio/mean": 1.000047206878662, + "sampling/importance_sampling_ratio/min": 0.7300384640693665, + "sampling/sampling_logp_difference/max": 0.3146580457687378, + "sampling/sampling_logp_difference/mean": 0.012933471240103245, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 164.21875, + "completions/mean_terminated_length": 164.21875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.31112101674079895, + "epoch": 0.6041666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03344291584823396, + "kl": 0.041676055639982224, + "learning_rate": 9.69602185530583e-07, + "loss": 0.0004, + "num_tokens": 15543478.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5971465110778809, + "sampling/importance_sampling_ratio/mean": 0.9997182488441467, + "sampling/importance_sampling_ratio/min": 0.6483074426651001, + "sampling/sampling_logp_difference/max": 0.4682185649871826, + "sampling/sampling_logp_difference/mean": 0.013913290575146675, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 201.703125, + "completions/mean_terminated_length": 201.703125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.40549153089523315, + "epoch": 0.6053921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9591883322297038, + "kl": 0.031131578609347343, + "learning_rate": 9.693571074055254e-07, + "loss": 0.0042, + "num_tokens": 15572531.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4643051624298096, + "sampling/importance_sampling_ratio/mean": 0.9999620318412781, + "sampling/importance_sampling_ratio/min": 0.5960602164268494, + "sampling/sampling_logp_difference/max": 0.5174136161804199, + "sampling/sampling_logp_difference/mean": 0.016925688832998276, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 199.671875, + "completions/mean_terminated_length": 199.671875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.36816954612731934, + "epoch": 0.6066176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9265582481618314, + "kl": 0.04405049607157707, + "learning_rate": 9.691110765152744e-07, + "loss": -0.0131, + "num_tokens": 15603278.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6700979471206665, + "sampling/importance_sampling_ratio/mean": 1.0002562999725342, + "sampling/importance_sampling_ratio/min": 0.612542986869812, + "sampling/sampling_logp_difference/max": 0.5128822326660156, + "sampling/sampling_logp_difference/mean": 0.01636900007724762, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 234.359375, + "completions/mean_terminated_length": 234.359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.28878411650657654, + "epoch": 0.6078431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01999654005497948, + "kl": 0.027198560535907745, + "learning_rate": 9.688640933592572e-07, + "loss": 0.0003, + "num_tokens": 15633573.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4348645210266113, + "sampling/importance_sampling_ratio/mean": 1.00016188621521, + "sampling/importance_sampling_ratio/min": 0.6262774467468262, + "sampling/sampling_logp_difference/max": 0.46796178817749023, + "sampling/sampling_logp_difference/mean": 0.011264925822615623, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 270.0, + "completions/mean_terminated_length": 270.0, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.3872169256210327, + "epoch": 0.6090686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030476085308733487, + "kl": 0.027259351685643196, + "learning_rate": 9.686161584388339e-07, + "loss": 0.0003, + "num_tokens": 15667861.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6089355945587158, + "sampling/importance_sampling_ratio/mean": 0.9999793171882629, + "sampling/importance_sampling_ratio/min": 0.4886479079723358, + "sampling/sampling_logp_difference/max": 0.7161130905151367, + "sampling/sampling_logp_difference/mean": 0.015679148957133293, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 220.609375, + "completions/mean_terminated_length": 220.609375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.4441712200641632, + "epoch": 0.6102941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02724183738489788, + "kl": 0.03278857469558716, + "learning_rate": 9.683672722572966e-07, + "loss": 0.0003, + "num_tokens": 15698796.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4401459693908691, + "sampling/importance_sampling_ratio/mean": 1.0000853538513184, + "sampling/importance_sampling_ratio/min": 0.6875020265579224, + "sampling/sampling_logp_difference/max": 0.37469053268432617, + "sampling/sampling_logp_difference/mean": 0.016673587262630463, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 232.828125, + "completions/mean_terminated_length": 232.828125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.35596445202827454, + "epoch": 0.6115196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03127872159983417, + "kl": 0.030974477529525757, + "learning_rate": 9.681174353198686e-07, + "loss": 0.0003, + "num_tokens": 15731729.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.479277491569519, + "sampling/importance_sampling_ratio/mean": 0.9996037483215332, + "sampling/importance_sampling_ratio/min": 0.6136996150016785, + "sampling/sampling_logp_difference/max": 0.4882497787475586, + "sampling/sampling_logp_difference/mean": 0.014733528718352318, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 178.984375, + "completions/mean_terminated_length": 178.984375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.39300116896629333, + "epoch": 0.6127450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03591396388828622, + "kl": 0.03897593915462494, + "learning_rate": 9.678666481337031e-07, + "loss": 0.0004, + "num_tokens": 15761648.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5277884006500244, + "sampling/importance_sampling_ratio/mean": 0.9996728301048279, + "sampling/importance_sampling_ratio/min": 0.7004489302635193, + "sampling/sampling_logp_difference/max": 0.42382121086120605, + "sampling/sampling_logp_difference/mean": 0.016662370413541794, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 267.265625, + "completions/mean_terminated_length": 267.265625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.3144611716270447, + "epoch": 0.6139705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.749493892103405, + "kl": 0.02991885133087635, + "learning_rate": 9.67614911207882e-07, + "loss": 0.0046, + "num_tokens": 15795393.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3558284044265747, + "sampling/importance_sampling_ratio/mean": 1.0002613067626953, + "sampling/importance_sampling_ratio/min": 0.6464096307754517, + "sampling/sampling_logp_difference/max": 0.4363219738006592, + "sampling/sampling_logp_difference/mean": 0.013599826022982597, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 276.828125, + "completions/mean_terminated_length": 276.828125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.2665802836418152, + "epoch": 0.6151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017226321367376545, + "kl": 0.021591052412986755, + "learning_rate": 9.673622250534155e-07, + "loss": 0.0002, + "num_tokens": 15832678.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.465712070465088, + "sampling/importance_sampling_ratio/mean": 1.0006533861160278, + "sampling/importance_sampling_ratio/min": 0.6309396624565125, + "sampling/sampling_logp_difference/max": 0.46054506301879883, + "sampling/sampling_logp_difference/mean": 0.011435139924287796, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 150.375, + "completions/mean_terminated_length": 150.375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2680723965167999, + "epoch": 0.616421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06538297599717596, + "kl": 0.04390225559473038, + "learning_rate": 9.671085901832404e-07, + "loss": 0.0004, + "num_tokens": 15855342.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5716720819473267, + "sampling/importance_sampling_ratio/mean": 0.9990843534469604, + "sampling/importance_sampling_ratio/min": 0.617487370967865, + "sampling/sampling_logp_difference/max": 0.48209667205810547, + "sampling/sampling_logp_difference/mean": 0.01410925853997469, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 226.421875, + "completions/mean_terminated_length": 226.421875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2998732924461365, + "epoch": 0.6176470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028155419050621314, + "kl": 0.027188263833522797, + "learning_rate": 9.668540071122195e-07, + "loss": 0.0003, + "num_tokens": 15885801.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5242469310760498, + "sampling/importance_sampling_ratio/mean": 0.9993939399719238, + "sampling/importance_sampling_ratio/min": 0.623637855052948, + "sampling/sampling_logp_difference/max": 0.4721853733062744, + "sampling/sampling_logp_difference/mean": 0.015196947380900383, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 229.171875, + "completions/mean_terminated_length": 229.171875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.33026987314224243, + "epoch": 0.6188725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.905760075770604, + "kl": 0.030161835253238678, + "learning_rate": 9.665984763571402e-07, + "loss": 0.011, + "num_tokens": 15918836.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4756348133087158, + "sampling/importance_sampling_ratio/mean": 1.0005979537963867, + "sampling/importance_sampling_ratio/min": 0.5976086854934692, + "sampling/sampling_logp_difference/max": 0.5148191452026367, + "sampling/sampling_logp_difference/mean": 0.013623598031699657, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 208.625, + "completions/mean_terminated_length": 208.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.35385337471961975, + "epoch": 0.6200980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041170782730491726, + "kl": 0.0330902636051178, + "learning_rate": 9.663419984367137e-07, + "loss": 0.0003, + "num_tokens": 15951116.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8479071855545044, + "sampling/importance_sampling_ratio/mean": 1.0002179145812988, + "sampling/importance_sampling_ratio/min": 0.6587467789649963, + "sampling/sampling_logp_difference/max": 0.6140537261962891, + "sampling/sampling_logp_difference/mean": 0.014573907479643822, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 227.015625, + "completions/mean_terminated_length": 227.015625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3508400321006775, + "epoch": 0.6213235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02524040177034461, + "kl": 0.027214359492063522, + "learning_rate": 9.660845738715742e-07, + "loss": 0.0003, + "num_tokens": 15981533.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9693646430969238, + "sampling/importance_sampling_ratio/mean": 0.9999514818191528, + "sampling/importance_sampling_ratio/min": 0.6059155464172363, + "sampling/sampling_logp_difference/max": 0.677711009979248, + "sampling/sampling_logp_difference/mean": 0.01470345351845026, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 238.421875, + "completions/mean_terminated_length": 238.421875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4612578749656677, + "epoch": 0.6225490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7956611667635086, + "kl": 0.0296800397336483, + "learning_rate": 9.658262031842769e-07, + "loss": 0.0164, + "num_tokens": 16015192.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5379420518875122, + "sampling/importance_sampling_ratio/mean": 1.0001564025878906, + "sampling/importance_sampling_ratio/min": 0.3161124885082245, + "sampling/sampling_logp_difference/max": 1.1516571044921875, + "sampling/sampling_logp_difference/mean": 0.015863286331295967, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.34082987904548645, + "epoch": 0.6237745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030028764226007094, + "kl": 0.03355834260582924, + "learning_rate": 9.655668868992983e-07, + "loss": 0.0003, + "num_tokens": 16048516.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5475269556045532, + "sampling/importance_sampling_ratio/mean": 1.0004501342773438, + "sampling/importance_sampling_ratio/min": 0.6653298139572144, + "sampling/sampling_logp_difference/max": 0.4366581439971924, + "sampling/sampling_logp_difference/mean": 0.014498144388198853, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 168.8125, + "completions/mean_terminated_length": 168.8125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.28656935691833496, + "epoch": 0.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03789431316473932, + "kl": 0.029552018269896507, + "learning_rate": 9.653066255430338e-07, + "loss": 0.0003, + "num_tokens": 16074808.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4943023920059204, + "sampling/importance_sampling_ratio/mean": 0.999640703201294, + "sampling/importance_sampling_ratio/min": 0.5432738065719604, + "sampling/sampling_logp_difference/max": 0.6101418733596802, + "sampling/sampling_logp_difference/mean": 0.013910012319684029, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 249.71875, + "completions/mean_terminated_length": 249.71875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3192594051361084, + "epoch": 0.6262254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018134661715106467, + "kl": 0.021534759551286697, + "learning_rate": 9.650454196437973e-07, + "loss": 0.0002, + "num_tokens": 16106294.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4958059787750244, + "sampling/importance_sampling_ratio/mean": 1.0002377033233643, + "sampling/importance_sampling_ratio/min": 0.718105137348175, + "sampling/sampling_logp_difference/max": 0.40266525745391846, + "sampling/sampling_logp_difference/mean": 0.013101302087306976, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 245.171875, + "completions/mean_terminated_length": 245.171875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.32846811413764954, + "epoch": 0.6274509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6441520351435096, + "kl": 0.025811253115534782, + "learning_rate": 9.647832697318206e-07, + "loss": -0.027, + "num_tokens": 16141777.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4265540838241577, + "sampling/importance_sampling_ratio/mean": 1.0003132820129395, + "sampling/importance_sampling_ratio/min": 0.5689274668693542, + "sampling/sampling_logp_difference/max": 0.564002275466919, + "sampling/sampling_logp_difference/mean": 0.013478966429829597, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 241.71875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3434150815010071, + "epoch": 0.6286764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02379823909954446, + "kl": 0.029206233099102974, + "learning_rate": 9.645201763392513e-07, + "loss": 0.0003, + "num_tokens": 16175023.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5291852951049805, + "sampling/importance_sampling_ratio/mean": 1.0002974271774292, + "sampling/importance_sampling_ratio/min": 0.6547035574913025, + "sampling/sampling_logp_difference/max": 0.42473506927490234, + "sampling/sampling_logp_difference/mean": 0.013717292807996273, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 155.875, + "completions/mean_terminated_length": 155.875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3104274272918701, + "epoch": 0.6299019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03345929852069136, + "kl": 0.03869575262069702, + "learning_rate": 9.64256140000152e-07, + "loss": 0.0004, + "num_tokens": 16203639.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5091041326522827, + "sampling/importance_sampling_ratio/mean": 0.9998499155044556, + "sampling/importance_sampling_ratio/min": 0.6605297327041626, + "sampling/sampling_logp_difference/max": 0.41471314430236816, + "sampling/sampling_logp_difference/mean": 0.014581705443561077, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 184.5625, + "completions/mean_terminated_length": 184.5625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3684101402759552, + "epoch": 0.6311274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02124262357646272, + "kl": 0.026777107268571854, + "learning_rate": 9.639911612505003e-07, + "loss": 0.0003, + "num_tokens": 16236283.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4286984205245972, + "sampling/importance_sampling_ratio/mean": 0.9999425411224365, + "sampling/importance_sampling_ratio/min": 0.6769490242004395, + "sampling/sampling_logp_difference/max": 0.39015936851501465, + "sampling/sampling_logp_difference/mean": 0.014349126257002354, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 202.25, + "completions/mean_terminated_length": 202.25, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3284221291542053, + "epoch": 0.6323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030142720478342194, + "kl": 0.031212424859404564, + "learning_rate": 9.63725240628186e-07, + "loss": 0.0003, + "num_tokens": 16267115.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3817118406295776, + "sampling/importance_sampling_ratio/mean": 0.9998214840888977, + "sampling/importance_sampling_ratio/min": 0.5461640357971191, + "sampling/sampling_logp_difference/max": 0.6048359870910645, + "sampling/sampling_logp_difference/mean": 0.013921651989221573, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 182.609375, + "completions/mean_terminated_length": 182.609375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4168780744075775, + "epoch": 0.633578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026171104157375946, + "kl": 0.03219608962535858, + "learning_rate": 9.634583786730108e-07, + "loss": 0.0003, + "num_tokens": 16297090.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.764905333518982, + "sampling/importance_sampling_ratio/mean": 1.0003851652145386, + "sampling/importance_sampling_ratio/min": 0.6018215417861938, + "sampling/sampling_logp_difference/max": 0.5680971145629883, + "sampling/sampling_logp_difference/mean": 0.016892993822693825, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 243.0, + "completions/mean_terminated_length": 243.0, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3474350869655609, + "epoch": 0.6348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022862287577593118, + "kl": 0.02698478288948536, + "learning_rate": 9.63190575926688e-07, + "loss": 0.0003, + "num_tokens": 16330642.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5744448900222778, + "sampling/importance_sampling_ratio/mean": 0.9994927644729614, + "sampling/importance_sampling_ratio/min": 0.732742965221405, + "sampling/sampling_logp_difference/max": 0.4539027214050293, + "sampling/sampling_logp_difference/mean": 0.013669499196112156, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 241.859375, + "completions/mean_terminated_length": 241.859375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4381487965583801, + "epoch": 0.6360294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7083376676619356, + "kl": 0.026370543986558914, + "learning_rate": 9.6292183293284e-07, + "loss": 0.0019, + "num_tokens": 16365689.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3973273038864136, + "sampling/importance_sampling_ratio/mean": 1.0000078678131104, + "sampling/importance_sampling_ratio/min": 0.6761813163757324, + "sampling/sampling_logp_difference/max": 0.391294002532959, + "sampling/sampling_logp_difference/mean": 0.013794781640172005, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 268.21875, + "completions/mean_terminated_length": 268.21875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.3459845185279846, + "epoch": 0.6372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018214372495190298, + "kl": 0.025208748877048492, + "learning_rate": 9.626521502369983e-07, + "loss": 0.0002, + "num_tokens": 16396423.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6887457370758057, + "sampling/importance_sampling_ratio/mean": 1.0004690885543823, + "sampling/importance_sampling_ratio/min": 0.5917854309082031, + "sampling/sampling_logp_difference/max": 0.5246111154556274, + "sampling/sampling_logp_difference/mean": 0.012911073863506317, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 205.546875, + "completions/mean_terminated_length": 205.546875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3549535572528839, + "epoch": 0.6384803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021210397741436706, + "kl": 0.026987168937921524, + "learning_rate": 9.623815283866015e-07, + "loss": 0.0003, + "num_tokens": 16425770.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3158849477767944, + "sampling/importance_sampling_ratio/mean": 0.9997355937957764, + "sampling/importance_sampling_ratio/min": 0.6550066471099854, + "sampling/sampling_logp_difference/max": 0.42310988903045654, + "sampling/sampling_logp_difference/mean": 0.013649879954755306, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 157.140625, + "completions/mean_terminated_length": 157.140625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3509664833545685, + "epoch": 0.6397058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0332528861772406, + "kl": 0.033507656306028366, + "learning_rate": 9.621099679309946e-07, + "loss": 0.0003, + "num_tokens": 16452371.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5231109857559204, + "sampling/importance_sampling_ratio/mean": 1.000582218170166, + "sampling/importance_sampling_ratio/min": 0.6256922483444214, + "sampling/sampling_logp_difference/max": 0.46889662742614746, + "sampling/sampling_logp_difference/mean": 0.015148179605603218, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 176.34375, + "completions/mean_terminated_length": 176.34375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.33977288007736206, + "epoch": 0.6409313725490197, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0450745515105564, + "kl": 0.030788611620664597, + "learning_rate": 9.618374694214285e-07, + "loss": 0.0184, + "num_tokens": 16477753.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.3973685503005981, + "sampling/importance_sampling_ratio/mean": 1.0008265972137451, + "sampling/importance_sampling_ratio/min": 0.6237364411354065, + "sampling/sampling_logp_difference/max": 0.4720273017883301, + "sampling/sampling_logp_difference/mean": 0.013767888769507408, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 235.328125, + "completions/mean_terminated_length": 235.328125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4321310818195343, + "epoch": 0.6421568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02730276239242315, + "kl": 0.03425852209329605, + "learning_rate": 9.615640334110578e-07, + "loss": 0.0003, + "num_tokens": 16515166.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6193687915802002, + "sampling/importance_sampling_ratio/mean": 1.0003039836883545, + "sampling/importance_sampling_ratio/min": 0.6216926574707031, + "sampling/sampling_logp_difference/max": 0.4820363521575928, + "sampling/sampling_logp_difference/mean": 0.014964373782277107, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 189.5, + "completions/mean_terminated_length": 189.5, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4211269021034241, + "epoch": 0.6433823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9895386822369383, + "kl": 0.031260281801223755, + "learning_rate": 9.612896604549401e-07, + "loss": 0.0008, + "num_tokens": 16542158.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3415215015411377, + "sampling/importance_sampling_ratio/mean": 0.9998394250869751, + "sampling/importance_sampling_ratio/min": 0.5685389041900635, + "sampling/sampling_logp_difference/max": 0.5646855235099792, + "sampling/sampling_logp_difference/mean": 0.01657898537814617, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 210.1875, + "completions/mean_terminated_length": 210.1875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.39013925194740295, + "epoch": 0.6446078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021518376420771413, + "kl": 0.025803860276937485, + "learning_rate": 9.610143511100354e-07, + "loss": 0.0003, + "num_tokens": 16570490.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.507156491279602, + "sampling/importance_sampling_ratio/mean": 1.000083565711975, + "sampling/importance_sampling_ratio/min": 0.6335555911064148, + "sampling/sampling_logp_difference/max": 0.4564075469970703, + "sampling/sampling_logp_difference/mean": 0.014585510827600956, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 228.59375, + "completions/mean_terminated_length": 228.59375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.37804198265075684, + "epoch": 0.6458333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6245177947861602, + "kl": 0.030640294775366783, + "learning_rate": 9.607381059352038e-07, + "loss": 0.0113, + "num_tokens": 16605312.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.611337423324585, + "sampling/importance_sampling_ratio/mean": 0.9999603033065796, + "sampling/importance_sampling_ratio/min": 0.6217000484466553, + "sampling/sampling_logp_difference/max": 0.4770646095275879, + "sampling/sampling_logp_difference/mean": 0.014867281541228294, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 191.8125, + "completions/mean_terminated_length": 191.8125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.353499174118042, + "epoch": 0.6470588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01948708633875749, + "kl": 0.023790864273905754, + "learning_rate": 9.60460925491206e-07, + "loss": 0.0002, + "num_tokens": 16638004.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4038140773773193, + "sampling/importance_sampling_ratio/mean": 0.9999080896377563, + "sampling/importance_sampling_ratio/min": 0.6374849677085876, + "sampling/sampling_logp_difference/max": 0.4502246379852295, + "sampling/sampling_logp_difference/mean": 0.014271529391407967, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 159.15625, + "completions/mean_terminated_length": 159.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4149744510650635, + "epoch": 0.6482843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9070816966077233, + "kl": 0.03448011726140976, + "learning_rate": 9.601828103407004e-07, + "loss": -0.0021, + "num_tokens": 16669070.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4417411088943481, + "sampling/importance_sampling_ratio/mean": 0.9999873638153076, + "sampling/importance_sampling_ratio/min": 0.7591252326965332, + "sampling/sampling_logp_difference/max": 0.36585140228271484, + "sampling/sampling_logp_difference/mean": 0.015302242711186409, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 204.765625, + "completions/mean_terminated_length": 204.765625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3343036472797394, + "epoch": 0.6495098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019794629333848835, + "kl": 0.02628006413578987, + "learning_rate": 9.599037610482433e-07, + "loss": 0.0003, + "num_tokens": 16699983.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.290763258934021, + "sampling/importance_sampling_ratio/mean": 1.0000771284103394, + "sampling/importance_sampling_ratio/min": 0.6042301058769226, + "sampling/sampling_logp_difference/max": 0.5038001537322998, + "sampling/sampling_logp_difference/mean": 0.014323609881103039, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 182.328125, + "completions/mean_terminated_length": 182.328125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3478272259235382, + "epoch": 0.6507352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02157603009356838, + "kl": 0.02507212944328785, + "learning_rate": 9.59623778180287e-07, + "loss": 0.0002, + "num_tokens": 16730036.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.432883381843567, + "sampling/importance_sampling_ratio/mean": 1.0006635189056396, + "sampling/importance_sampling_ratio/min": 0.7574724555015564, + "sampling/sampling_logp_difference/max": 0.35968875885009766, + "sampling/sampling_logp_difference/mean": 0.014315593987703323, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 184.5, + "completions/mean_terminated_length": 184.5, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.41078031063079834, + "epoch": 0.6519607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0024160231052435, + "kl": 0.03048519417643547, + "learning_rate": 9.593428623051791e-07, + "loss": -0.0255, + "num_tokens": 16759796.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4374518394470215, + "sampling/importance_sampling_ratio/mean": 0.9993870258331299, + "sampling/importance_sampling_ratio/min": 0.6051985621452332, + "sampling/sampling_logp_difference/max": 0.5021986961364746, + "sampling/sampling_logp_difference/mean": 0.016140135005116463, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 192.734375, + "completions/mean_terminated_length": 192.734375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35647279024124146, + "epoch": 0.6531862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019612389785877708, + "kl": 0.02538762241601944, + "learning_rate": 9.59061013993161e-07, + "loss": 0.0002, + "num_tokens": 16793795.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5227993726730347, + "sampling/importance_sampling_ratio/mean": 0.9999618530273438, + "sampling/importance_sampling_ratio/min": 0.6399714350700378, + "sampling/sampling_logp_difference/max": 0.4463317394256592, + "sampling/sampling_logp_difference/mean": 0.013458995148539543, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 209.6875, + "completions/mean_terminated_length": 209.6875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3694957196712494, + "epoch": 0.6544117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5830303275611716, + "kl": 0.027595968917012215, + "learning_rate": 9.587782338163667e-07, + "loss": -0.0027, + "num_tokens": 16826191.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.543566346168518, + "sampling/importance_sampling_ratio/mean": 1.000403642654419, + "sampling/importance_sampling_ratio/min": 0.6606270670890808, + "sampling/sampling_logp_difference/max": 0.43409550189971924, + "sampling/sampling_logp_difference/mean": 0.013270800933241844, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 218.65625, + "completions/mean_terminated_length": 218.65625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4609197676181793, + "epoch": 0.6556372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021030880073322496, + "kl": 0.02733190357685089, + "learning_rate": 9.584945223488226e-07, + "loss": 0.0003, + "num_tokens": 16861129.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2990570068359375, + "sampling/importance_sampling_ratio/mean": 0.999771237373352, + "sampling/importance_sampling_ratio/min": 0.6057155132293701, + "sampling/sampling_logp_difference/max": 0.5013449192047119, + "sampling/sampling_logp_difference/mean": 0.014958461746573448, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 209.5, + "completions/mean_terminated_length": 209.5, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.44152161478996277, + "epoch": 0.6568627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021595070807418862, + "kl": 0.025443298742175102, + "learning_rate": 9.582098801664443e-07, + "loss": 0.0003, + "num_tokens": 16893961.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4320579767227173, + "sampling/importance_sampling_ratio/mean": 0.9998829364776611, + "sampling/importance_sampling_ratio/min": 0.6851339340209961, + "sampling/sampling_logp_difference/max": 0.378140926361084, + "sampling/sampling_logp_difference/mean": 0.015051309950649738, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 229.4375, + "completions/mean_terminated_length": 229.4375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.39369893074035645, + "epoch": 0.6580882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019634264798132458, + "kl": 0.027372129261493683, + "learning_rate": 9.579243078470378e-07, + "loss": 0.0003, + "num_tokens": 16929557.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.485055685043335, + "sampling/importance_sampling_ratio/mean": 0.9997790455818176, + "sampling/importance_sampling_ratio/min": 0.5497868061065674, + "sampling/sampling_logp_difference/max": 0.5982246398925781, + "sampling/sampling_logp_difference/mean": 0.014482814818620682, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 204.125, + "completions/mean_terminated_length": 204.125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.5653814077377319, + "epoch": 0.6593137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9988587526287372, + "kl": 0.03801679238677025, + "learning_rate": 9.576378059702968e-07, + "loss": -0.0023, + "num_tokens": 16963741.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.436959981918335, + "sampling/importance_sampling_ratio/mean": 1.0001435279846191, + "sampling/importance_sampling_ratio/min": 0.7102527022361755, + "sampling/sampling_logp_difference/max": 0.3625297546386719, + "sampling/sampling_logp_difference/mean": 0.017763778567314148, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 205.3125, + "completions/mean_terminated_length": 205.3125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.43430545926094055, + "epoch": 0.6605392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021384107168272663, + "kl": 0.03111300617456436, + "learning_rate": 9.573503751178018e-07, + "loss": 0.0003, + "num_tokens": 16996177.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.643847942352295, + "sampling/importance_sampling_ratio/mean": 0.9997886419296265, + "sampling/importance_sampling_ratio/min": 0.6688855290412903, + "sampling/sampling_logp_difference/max": 0.497039794921875, + "sampling/sampling_logp_difference/mean": 0.014810606837272644, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 181.546875, + "completions/mean_terminated_length": 181.546875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.5582599639892578, + "epoch": 0.6617647058823529, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3363543230620318, + "kl": 0.038329631090164185, + "learning_rate": 9.570620158730194e-07, + "loss": 0.0041, + "num_tokens": 17034692.0, + "reward": 0.375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.3978967666625977, + "sampling/importance_sampling_ratio/mean": 1.0004541873931885, + "sampling/importance_sampling_ratio/min": 0.6934731006622314, + "sampling/sampling_logp_difference/max": 0.3660428524017334, + "sampling/sampling_logp_difference/mean": 0.01722395420074463, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 199.890625, + "completions/mean_terminated_length": 199.890625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4284636974334717, + "epoch": 0.6629901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024349416401760998, + "kl": 0.03167618438601494, + "learning_rate": 9.567727288213004e-07, + "loss": 0.0003, + "num_tokens": 17064957.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.341268539428711, + "sampling/importance_sampling_ratio/mean": 1.0001959800720215, + "sampling/importance_sampling_ratio/min": 0.4498509168624878, + "sampling/sampling_logp_difference/max": 0.7988390922546387, + "sampling/sampling_logp_difference/mean": 0.01536840945482254, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 208.125, + "completions/mean_terminated_length": 208.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.39415794610977173, + "epoch": 0.6642156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03048027674448327, + "kl": 0.03377959877252579, + "learning_rate": 9.564825145498793e-07, + "loss": 0.0003, + "num_tokens": 17096053.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4786802530288696, + "sampling/importance_sampling_ratio/mean": 1.0002336502075195, + "sampling/importance_sampling_ratio/min": 0.6484001278877258, + "sampling/sampling_logp_difference/max": 0.43324732780456543, + "sampling/sampling_logp_difference/mean": 0.014283658936619759, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 187.53125, + "completions/mean_terminated_length": 187.53125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4927505850791931, + "epoch": 0.6654411764705882, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3669374222846087, + "kl": 0.03446304798126221, + "learning_rate": 9.561913736478728e-07, + "loss": 0.0567, + "num_tokens": 17128455.0, + "reward": 0.21875, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6559178829193115, + "sampling/importance_sampling_ratio/mean": 1.000101089477539, + "sampling/importance_sampling_ratio/min": 0.6176440119743347, + "sampling/sampling_logp_difference/max": 0.5043554306030273, + "sampling/sampling_logp_difference/mean": 0.01626628078520298, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 186.734375, + "completions/mean_terminated_length": 186.734375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3815724551677704, + "epoch": 0.6666666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021800493826795702, + "kl": 0.02597838081419468, + "learning_rate": 9.558993067062784e-07, + "loss": 0.0003, + "num_tokens": 17157414.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3621898889541626, + "sampling/importance_sampling_ratio/mean": 1.000091314315796, + "sampling/importance_sampling_ratio/min": 0.5554914474487305, + "sampling/sampling_logp_difference/max": 0.5879020690917969, + "sampling/sampling_logp_difference/mean": 0.014378965832293034, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 206.6875, + "completions/mean_terminated_length": 206.6875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.42119675874710083, + "epoch": 0.6678921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9023483510896259, + "kl": 0.028189023956656456, + "learning_rate": 9.556063143179735e-07, + "loss": 0.0198, + "num_tokens": 17194594.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3934311866760254, + "sampling/importance_sampling_ratio/mean": 0.9997861981391907, + "sampling/importance_sampling_ratio/min": 0.6242069005966187, + "sampling/sampling_logp_difference/max": 0.47127342224121094, + "sampling/sampling_logp_difference/mean": 0.014654896222054958, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 173.03125, + "completions/mean_terminated_length": 173.03125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4660423696041107, + "epoch": 0.6691176470588235, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0264795769518509, + "kl": 0.034461360424757004, + "learning_rate": 9.55312397077714e-07, + "loss": 0.0338, + "num_tokens": 17222260.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.808357834815979, + "sampling/importance_sampling_ratio/mean": 0.9998067617416382, + "sampling/importance_sampling_ratio/min": 0.7175599932670593, + "sampling/sampling_logp_difference/max": 0.5924191474914551, + "sampling/sampling_logp_difference/mean": 0.01736539974808693, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 218.71875, + "completions/mean_terminated_length": 218.71875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.34565114974975586, + "epoch": 0.6703431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0188774558366317, + "kl": 0.024478325620293617, + "learning_rate": 9.550175555821334e-07, + "loss": 0.0002, + "num_tokens": 17255666.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6260552406311035, + "sampling/importance_sampling_ratio/mean": 0.9997800588607788, + "sampling/importance_sampling_ratio/min": 0.6128374934196472, + "sampling/sampling_logp_difference/max": 0.4896554946899414, + "sampling/sampling_logp_difference/mean": 0.013118880800902843, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 187.59375, + "completions/mean_terminated_length": 187.59375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4557947814464569, + "epoch": 0.6715686274509803, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9563803894898965, + "kl": 0.02949117124080658, + "learning_rate": 9.547217904297409e-07, + "loss": 0.0284, + "num_tokens": 17285576.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.4972301721572876, + "sampling/importance_sampling_ratio/mean": 1.0004384517669678, + "sampling/importance_sampling_ratio/min": 0.674802303314209, + "sampling/sampling_logp_difference/max": 0.40361690521240234, + "sampling/sampling_logp_difference/mean": 0.016303321346640587, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 172.328125, + "completions/mean_terminated_length": 172.328125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4239822030067444, + "epoch": 0.6727941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025517293583861147, + "kl": 0.03518486022949219, + "learning_rate": 9.544251022209216e-07, + "loss": 0.0004, + "num_tokens": 17320141.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5744264125823975, + "sampling/importance_sampling_ratio/mean": 1.0005159378051758, + "sampling/importance_sampling_ratio/min": 0.7415908575057983, + "sampling/sampling_logp_difference/max": 0.4538910388946533, + "sampling/sampling_logp_difference/mean": 0.015085499733686447, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 147.625, + "completions/mean_terminated_length": 147.625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.29904037714004517, + "epoch": 0.6740196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024688332129142475, + "kl": 0.02920544147491455, + "learning_rate": 9.541274915579334e-07, + "loss": 0.0003, + "num_tokens": 17344965.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4720820188522339, + "sampling/importance_sampling_ratio/mean": 0.9996750354766846, + "sampling/importance_sampling_ratio/min": 0.4936216473579407, + "sampling/sampling_logp_difference/max": 0.7059860229492188, + "sampling/sampling_logp_difference/mean": 0.013709956780076027, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 208.609375, + "completions/mean_terminated_length": 208.609375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.392837256193161, + "epoch": 0.6752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020842003603693177, + "kl": 0.02458246424794197, + "learning_rate": 9.538289590449071e-07, + "loss": 0.0002, + "num_tokens": 17377772.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5609105825424194, + "sampling/importance_sampling_ratio/mean": 0.9998144507408142, + "sampling/importance_sampling_ratio/min": 0.590291440486908, + "sampling/sampling_logp_difference/max": 0.5271389484405518, + "sampling/sampling_logp_difference/mean": 0.013637501746416092, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 169.25, + "completions/mean_terminated_length": 169.25, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.40312328934669495, + "epoch": 0.6764705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027996955651263342, + "kl": 0.0280432291328907, + "learning_rate": 9.535295052878449e-07, + "loss": 0.0003, + "num_tokens": 17408492.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4875401258468628, + "sampling/importance_sampling_ratio/mean": 1.0007259845733643, + "sampling/importance_sampling_ratio/min": 0.7447932362556458, + "sampling/sampling_logp_difference/max": 0.3971238136291504, + "sampling/sampling_logp_difference/mean": 0.014545347541570663, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 133.171875, + "completions/mean_terminated_length": 133.171875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.30149027705192566, + "epoch": 0.6776960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034764274567367234, + "kl": 0.03032582998275757, + "learning_rate": 9.53229130894619e-07, + "loss": 0.0003, + "num_tokens": 17433271.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5182085037231445, + "sampling/importance_sampling_ratio/mean": 0.9995010495185852, + "sampling/importance_sampling_ratio/min": 0.6125888228416443, + "sampling/sampling_logp_difference/max": 0.49006128311157227, + "sampling/sampling_logp_difference/mean": 0.014048278331756592, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 195.5625, + "completions/mean_terminated_length": 195.5625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.45025965571403503, + "epoch": 0.678921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2667513377636765, + "kl": 0.035085082054138184, + "learning_rate": 9.529278364749702e-07, + "loss": 0.0295, + "num_tokens": 17465835.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4873110055923462, + "sampling/importance_sampling_ratio/mean": 0.9998199343681335, + "sampling/importance_sampling_ratio/min": 0.6853278279304504, + "sampling/sampling_logp_difference/max": 0.3969697952270508, + "sampling/sampling_logp_difference/mean": 0.016888249665498734, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 191.28125, + "completions/mean_terminated_length": 191.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4631187319755554, + "epoch": 0.6801470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3981282689064027, + "kl": 0.03789568692445755, + "learning_rate": 9.526256226405073e-07, + "loss": -0.0016, + "num_tokens": 17494589.0, + "reward": 0.3125, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4454609155654907, + "sampling/importance_sampling_ratio/mean": 0.9997762441635132, + "sampling/importance_sampling_ratio/min": 0.4871583580970764, + "sampling/sampling_logp_difference/max": 0.7191660404205322, + "sampling/sampling_logp_difference/mean": 0.015756256878376007, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 155.90625, + "completions/mean_terminated_length": 155.90625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3670254349708557, + "epoch": 0.6813725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9368330927073913, + "kl": 0.03317684680223465, + "learning_rate": 9.523224900047051e-07, + "loss": 0.0142, + "num_tokens": 17518631.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4525972604751587, + "sampling/importance_sampling_ratio/mean": 0.9997410774230957, + "sampling/importance_sampling_ratio/min": 0.677254319190979, + "sampling/sampling_logp_difference/max": 0.38970839977264404, + "sampling/sampling_logp_difference/mean": 0.014513498172163963, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 168.484375, + "completions/mean_terminated_length": 168.484375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4081436097621918, + "epoch": 0.6825980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02472310940055831, + "kl": 0.02606888674199581, + "learning_rate": 9.520184391829036e-07, + "loss": 0.0003, + "num_tokens": 17550422.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6272908449172974, + "sampling/importance_sampling_ratio/mean": 1.0004005432128906, + "sampling/importance_sampling_ratio/min": 0.6198011636734009, + "sampling/sampling_logp_difference/max": 0.48691654205322266, + "sampling/sampling_logp_difference/mean": 0.015636038035154343, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 181.203125, + "completions/mean_terminated_length": 181.203125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4906782805919647, + "epoch": 0.6838235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06104910140848077, + "kl": 0.04024219140410423, + "learning_rate": 9.517134707923069e-07, + "loss": 0.0004, + "num_tokens": 17582147.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8212308883666992, + "sampling/importance_sampling_ratio/mean": 1.00010347366333, + "sampling/importance_sampling_ratio/min": 0.6550912261009216, + "sampling/sampling_logp_difference/max": 0.5995125770568848, + "sampling/sampling_logp_difference/mean": 0.018436823040246964, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 167.4375, + "completions/mean_terminated_length": 167.4375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.36911672353744507, + "epoch": 0.6850490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04059392221494458, + "kl": 0.03382398188114166, + "learning_rate": 9.514075854519813e-07, + "loss": 0.0003, + "num_tokens": 17609327.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4043327569961548, + "sampling/importance_sampling_ratio/mean": 0.9991638660430908, + "sampling/importance_sampling_ratio/min": 0.6094186902046204, + "sampling/sampling_logp_difference/max": 0.49524974822998047, + "sampling/sampling_logp_difference/mean": 0.014528224244713783, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 194.359375, + "completions/mean_terminated_length": 194.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4601079523563385, + "epoch": 0.6862745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8419090426673429, + "kl": 0.03641054779291153, + "learning_rate": 9.511007837828548e-07, + "loss": -0.0052, + "num_tokens": 17643894.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5000439882278442, + "sampling/importance_sampling_ratio/mean": 1.0002095699310303, + "sampling/importance_sampling_ratio/min": 0.6145036816596985, + "sampling/sampling_logp_difference/max": 0.4869403839111328, + "sampling/sampling_logp_difference/mean": 0.016088012605905533, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 226.3125, + "completions/mean_terminated_length": 226.3125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.43548086285591125, + "epoch": 0.6875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6768620242535359, + "kl": 0.025557810440659523, + "learning_rate": 9.507930664077153e-07, + "loss": 0.0145, + "num_tokens": 17681338.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3931565284729004, + "sampling/importance_sampling_ratio/mean": 0.9997798800468445, + "sampling/importance_sampling_ratio/min": 0.5388790965080261, + "sampling/sampling_logp_difference/max": 0.6182640194892883, + "sampling/sampling_logp_difference/mean": 0.015264216810464859, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 190.5, + "completions/mean_terminated_length": 190.5, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3668908476829529, + "epoch": 0.6887254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022978793147968482, + "kl": 0.024108467623591423, + "learning_rate": 9.504844339512094e-07, + "loss": 0.0002, + "num_tokens": 17713066.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000882148742676, + "sampling/importance_sampling_ratio/min": 0.729113757610321, + "sampling/sampling_logp_difference/max": 0.7371416091918945, + "sampling/sampling_logp_difference/mean": 0.012965286150574684, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 149.734375, + "completions/mean_terminated_length": 149.734375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.46979600191116333, + "epoch": 0.6899509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1338108304088952, + "kl": 0.058991335332393646, + "learning_rate": 9.501748870398419e-07, + "loss": 0.0062, + "num_tokens": 17740009.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6401453018188477, + "sampling/importance_sampling_ratio/mean": 1.00006103515625, + "sampling/importance_sampling_ratio/min": 0.6931049823760986, + "sampling/sampling_logp_difference/max": 0.4947848320007324, + "sampling/sampling_logp_difference/mean": 0.016827845945954323, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 158.125, + "completions/mean_terminated_length": 158.125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3002978265285492, + "epoch": 0.6911764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03904848237390688, + "kl": 0.03209100291132927, + "learning_rate": 9.498644263019731e-07, + "loss": 0.0003, + "num_tokens": 17769921.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4301660060882568, + "sampling/importance_sampling_ratio/mean": 1.0002093315124512, + "sampling/importance_sampling_ratio/min": 0.6262974739074707, + "sampling/sampling_logp_difference/max": 0.4679298400878906, + "sampling/sampling_logp_difference/mean": 0.013269246555864811, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 179.953125, + "completions/mean_terminated_length": 179.953125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.5559252500534058, + "epoch": 0.6924019607843137, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.703238584274116, + "kl": 0.06438381224870682, + "learning_rate": 9.495530523678186e-07, + "loss": 0.0092, + "num_tokens": 17801038.0, + "reward": -0.21875, + "reward_std": 0.676956295967102, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5438649654388428, + "sampling/importance_sampling_ratio/mean": 1.000163197517395, + "sampling/importance_sampling_ratio/min": 0.6384997963905334, + "sampling/sampling_logp_difference/max": 0.44863390922546387, + "sampling/sampling_logp_difference/mean": 0.01882471702992916, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 186.296875, + "completions/mean_terminated_length": 186.296875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.46670347452163696, + "epoch": 0.6936274509803921, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2810627770010172, + "kl": 0.05042688176035881, + "learning_rate": 9.492407658694477e-07, + "loss": -0.0106, + "num_tokens": 17828849.0, + "reward": 0.4375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.465358853340149, + "sampling/importance_sampling_ratio/mean": 1.0000073909759521, + "sampling/importance_sampling_ratio/min": 0.6191340684890747, + "sampling/sampling_logp_difference/max": 0.479433536529541, + "sampling/sampling_logp_difference/mean": 0.01790396124124527, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 173.859375, + "completions/mean_terminated_length": 173.859375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.44244450330734253, + "epoch": 0.6948529411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5968482256097214, + "kl": 0.05212203413248062, + "learning_rate": 9.489275674407825e-07, + "loss": -0.01, + "num_tokens": 17856760.0, + "reward": 0.34375, + "reward_std": 0.5809217691421509, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5171775817871094, + "sampling/importance_sampling_ratio/mean": 0.999706506729126, + "sampling/importance_sampling_ratio/min": 0.641122043132782, + "sampling/sampling_logp_difference/max": 0.444535493850708, + "sampling/sampling_logp_difference/mean": 0.016785025596618652, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 133.09375, + "completions/mean_terminated_length": 133.09375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.36766040325164795, + "epoch": 0.696078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06045360132262736, + "kl": 0.036403343081474304, + "learning_rate": 9.486134577175957e-07, + "loss": 0.0004, + "num_tokens": 17879662.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4707084894180298, + "sampling/importance_sampling_ratio/mean": 0.9995070695877075, + "sampling/importance_sampling_ratio/min": 0.7074356079101562, + "sampling/sampling_logp_difference/max": 0.38574421405792236, + "sampling/sampling_logp_difference/mean": 0.01508602686226368, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 178.390625, + "completions/mean_terminated_length": 178.390625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.43458572030067444, + "epoch": 0.6973039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5381952728820387, + "kl": 0.04792370647192001, + "learning_rate": 9.482984373375104e-07, + "loss": 0.0129, + "num_tokens": 17910615.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4773439168930054, + "sampling/importance_sampling_ratio/mean": 1.0000048875808716, + "sampling/importance_sampling_ratio/min": 0.7157042026519775, + "sampling/sampling_logp_difference/max": 0.39024579524993896, + "sampling/sampling_logp_difference/mean": 0.017831791192293167, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 193.40625, + "completions/mean_terminated_length": 193.40625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.48558875918388367, + "epoch": 0.6985294117647058, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6476404351655827, + "kl": 0.06947138905525208, + "learning_rate": 9.479825069399977e-07, + "loss": -0.0056, + "num_tokens": 17940721.0, + "reward": 0.125, + "reward_std": 0.6831300258636475, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.7248539924621582, + "sampling/importance_sampling_ratio/mean": 0.9997266530990601, + "sampling/importance_sampling_ratio/min": 0.6415937542915344, + "sampling/sampling_logp_difference/max": 0.545142412185669, + "sampling/sampling_logp_difference/mean": 0.01905631273984909, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 196.40625, + "completions/mean_terminated_length": 196.40625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4202972948551178, + "epoch": 0.6997549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2268689030320934, + "kl": 0.03882734477519989, + "learning_rate": 9.476656671663766e-07, + "loss": -0.0139, + "num_tokens": 17975355.0, + "reward": 0.6875, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5569466352462769, + "sampling/importance_sampling_ratio/mean": 0.9996051788330078, + "sampling/importance_sampling_ratio/min": 0.6746933460235596, + "sampling/sampling_logp_difference/max": 0.44272661209106445, + "sampling/sampling_logp_difference/mean": 0.015455996617674828, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 163.25, + "completions/mean_terminated_length": 163.25, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.34861987829208374, + "epoch": 0.7009803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03036379946318668, + "kl": 0.032124996185302734, + "learning_rate": 9.473479186598114e-07, + "loss": 0.0003, + "num_tokens": 18002155.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.428978443145752, + "sampling/importance_sampling_ratio/mean": 1.0002634525299072, + "sampling/importance_sampling_ratio/min": 0.6447941064834595, + "sampling/sampling_logp_difference/max": 0.43882429599761963, + "sampling/sampling_logp_difference/mean": 0.016839023679494858, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 183.921875, + "completions/mean_terminated_length": 183.921875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3184646964073181, + "epoch": 0.7022058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8855265123782843, + "kl": 0.03036251664161682, + "learning_rate": 9.470292620653119e-07, + "loss": -0.019, + "num_tokens": 18030918.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5637354850769043, + "sampling/importance_sampling_ratio/mean": 0.9997586011886597, + "sampling/importance_sampling_ratio/min": 0.6622517704963684, + "sampling/sampling_logp_difference/max": 0.44707751274108887, + "sampling/sampling_logp_difference/mean": 0.013248957693576813, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 232.34375, + "completions/mean_terminated_length": 232.34375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.4914621412754059, + "epoch": 0.7034313725490197, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.6498129315873309, + "kl": 0.04989838972687721, + "learning_rate": 9.467096980297304e-07, + "loss": 0.0529, + "num_tokens": 18063852.0, + "reward": 0.6875, + "reward_std": 0.690913200378418, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5468924045562744, + "sampling/importance_sampling_ratio/mean": 0.9999454021453857, + "sampling/importance_sampling_ratio/min": 0.6276541948318481, + "sampling/sampling_logp_difference/max": 0.46576595306396484, + "sampling/sampling_logp_difference/mean": 0.01687515154480934, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 232.953125, + "completions/mean_terminated_length": 232.953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.41729480028152466, + "epoch": 0.7046568627450981, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.327751073858522, + "kl": 0.05011233687400818, + "learning_rate": 9.463892272017618e-07, + "loss": 0.0013, + "num_tokens": 18099049.0, + "reward": 0.53125, + "reward_std": 0.7129635810852051, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007917881011963, + "sampling/importance_sampling_ratio/min": 0.6794153451919556, + "sampling/sampling_logp_difference/max": 0.846982479095459, + "sampling/sampling_logp_difference/mean": 0.014939257875084877, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 184.359375, + "completions/mean_terminated_length": 184.359375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4155169427394867, + "epoch": 0.7058823529411765, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.840841066458298, + "kl": 0.040389060974121094, + "learning_rate": 9.460678502319416e-07, + "loss": -0.051, + "num_tokens": 18126704.0, + "reward": -0.03125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3847591876983643, + "sampling/importance_sampling_ratio/mean": 0.999739408493042, + "sampling/importance_sampling_ratio/min": 0.63115394115448, + "sampling/sampling_logp_difference/max": 0.4602055549621582, + "sampling/sampling_logp_difference/mean": 0.01599350944161415, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 288.578125, + "completions/mean_terminated_length": 288.578125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.397461861371994, + "epoch": 0.7071078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8568972578823097, + "kl": 0.02606649324297905, + "learning_rate": 9.457455677726447e-07, + "loss": -0.002, + "num_tokens": 18166325.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4671045541763306, + "sampling/importance_sampling_ratio/mean": 0.9998229742050171, + "sampling/importance_sampling_ratio/min": 0.6743451356887817, + "sampling/sampling_logp_difference/max": 0.39401328563690186, + "sampling/sampling_logp_difference/mean": 0.014239683747291565, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 298.015625, + "completions/mean_terminated_length": 298.015625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3736385107040405, + "epoch": 0.7083333333333334, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6144241358813192, + "kl": 0.026287073269486427, + "learning_rate": 9.454223804780841e-07, + "loss": 0.0106, + "num_tokens": 18203526.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5859181880950928, + "sampling/importance_sampling_ratio/mean": 1.000488519668579, + "sampling/importance_sampling_ratio/min": 0.6167355179786682, + "sampling/sampling_logp_difference/max": 0.48331499099731445, + "sampling/sampling_logp_difference/mean": 0.013991307467222214, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 304.25, + "completions/mean_terminated_length": 304.25, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.36850547790527344, + "epoch": 0.7095588235294118, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1139940989323576, + "kl": 0.03561537712812424, + "learning_rate": 9.450982890043094e-07, + "loss": -0.0003, + "num_tokens": 18243430.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4850008487701416, + "sampling/importance_sampling_ratio/mean": 0.9996840357780457, + "sampling/importance_sampling_ratio/min": 0.6319139003753662, + "sampling/sampling_logp_difference/max": 0.45900213718414307, + "sampling/sampling_logp_difference/mean": 0.01356898620724678, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 295.5, + "completions/mean_terminated_length": 295.5, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.4161064624786377, + "epoch": 0.7107843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6783415704971543, + "kl": 0.03330086171627045, + "learning_rate": 9.447732940092059e-07, + "loss": 0.0223, + "num_tokens": 18283046.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.493503212928772, + "sampling/importance_sampling_ratio/mean": 0.9998776912689209, + "sampling/importance_sampling_ratio/min": 0.6947564482688904, + "sampling/sampling_logp_difference/max": 0.4011244773864746, + "sampling/sampling_logp_difference/mean": 0.015456114895641804, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 291.359375, + "completions/mean_terminated_length": 291.359375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.45157065987586975, + "epoch": 0.7120098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8073574799969249, + "kl": 0.03743080794811249, + "learning_rate": 9.444473961524927e-07, + "loss": -0.0227, + "num_tokens": 18330797.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.592615008354187, + "sampling/importance_sampling_ratio/mean": 1.0003517866134644, + "sampling/importance_sampling_ratio/min": 0.45972323417663574, + "sampling/sampling_logp_difference/max": 0.7771306037902832, + "sampling/sampling_logp_difference/mean": 0.016215139999985695, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 270.40625, + "completions/mean_terminated_length": 270.40625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.34220874309539795, + "epoch": 0.7132352941176471, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8146313188462274, + "kl": 0.034279175102710724, + "learning_rate": 9.441205960957219e-07, + "loss": 0.052, + "num_tokens": 18367975.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8494188785552979, + "sampling/importance_sampling_ratio/mean": 0.9999802708625793, + "sampling/importance_sampling_ratio/min": 0.6302123069763184, + "sampling/sampling_logp_difference/max": 0.6148715019226074, + "sampling/sampling_logp_difference/mean": 0.013569234870374203, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 226.015625, + "completions/mean_terminated_length": 226.015625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3353632688522339, + "epoch": 0.7144607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05072026022762656, + "kl": 0.04300938546657562, + "learning_rate": 9.43792894502277e-07, + "loss": 0.0004, + "num_tokens": 18398888.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.521698236465454, + "sampling/importance_sampling_ratio/mean": 1.0002022981643677, + "sampling/importance_sampling_ratio/min": 0.608488917350769, + "sampling/sampling_logp_difference/max": 0.4967765808105469, + "sampling/sampling_logp_difference/mean": 0.014746149070560932, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 334.71875, + "completions/mean_terminated_length": 334.71875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.416095495223999, + "epoch": 0.7156862745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8687483700560874, + "kl": 0.03589534014463425, + "learning_rate": 9.434642920373713e-07, + "loss": -0.0082, + "num_tokens": 18442230.0, + "reward": 0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.3913553953170776, + "sampling/importance_sampling_ratio/mean": 0.9997699856758118, + "sampling/importance_sampling_ratio/min": 0.7111809849739075, + "sampling/sampling_logp_difference/max": 0.3408282995223999, + "sampling/sampling_logp_difference/mean": 0.013978583738207817, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 252.78125, + "completions/mean_terminated_length": 252.78125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3994787335395813, + "epoch": 0.7169117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7470459942229324, + "kl": 0.048064589500427246, + "learning_rate": 9.431347893680472e-07, + "loss": -0.0062, + "num_tokens": 18473192.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.4770106077194214, + "sampling/importance_sampling_ratio/mean": 0.9999533295631409, + "sampling/importance_sampling_ratio/min": 0.6099969744682312, + "sampling/sampling_logp_difference/max": 0.49430131912231445, + "sampling/sampling_logp_difference/mean": 0.016215935349464417, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 266.203125, + "completions/mean_terminated_length": 266.203125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.2780820429325104, + "epoch": 0.7181372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05454274155724163, + "kl": 0.0379292257130146, + "learning_rate": 9.428043871631739e-07, + "loss": 0.0004, + "num_tokens": 18505957.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4753788709640503, + "sampling/importance_sampling_ratio/mean": 1.0000276565551758, + "sampling/importance_sampling_ratio/min": 0.6772292852401733, + "sampling/sampling_logp_difference/max": 0.3897453546524048, + "sampling/sampling_logp_difference/mean": 0.011633490212261677, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 378.421875, + "completions/mean_terminated_length": 378.421875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.4514898359775543, + "epoch": 0.7193627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6391472605750311, + "kl": 0.038221873342990875, + "learning_rate": 9.424730860934472e-07, + "loss": -0.0202, + "num_tokens": 18553408.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5621294975280762, + "sampling/importance_sampling_ratio/mean": 1.0005515813827515, + "sampling/importance_sampling_ratio/min": 0.6183229684829712, + "sampling/sampling_logp_difference/max": 0.4807443618774414, + "sampling/sampling_logp_difference/mean": 0.015158019959926605, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 216.390625, + "completions/mean_terminated_length": 216.390625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.34938451647758484, + "epoch": 0.7205882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03940199990870377, + "kl": 0.047378502786159515, + "learning_rate": 9.421408868313873e-07, + "loss": 0.0004, + "num_tokens": 18581321.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4404903650283813, + "sampling/importance_sampling_ratio/mean": 1.0000039339065552, + "sampling/importance_sampling_ratio/min": 0.6431145071983337, + "sampling/sampling_logp_difference/max": 0.44143247604370117, + "sampling/sampling_logp_difference/mean": 0.014487783424556255, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 281.59375, + "completions/mean_terminated_length": 281.59375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3534771203994751, + "epoch": 0.7218137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03664400527724794, + "kl": 0.048589833080768585, + "learning_rate": 9.418077900513376e-07, + "loss": 0.0004, + "num_tokens": 18616127.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5671957731246948, + "sampling/importance_sampling_ratio/mean": 1.0001347064971924, + "sampling/importance_sampling_ratio/min": 0.7090094685554504, + "sampling/sampling_logp_difference/max": 0.44928789138793945, + "sampling/sampling_logp_difference/mean": 0.013496083207428455, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 254.546875, + "completions/mean_terminated_length": 254.546875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.4544410705566406, + "epoch": 0.7230392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16074988784042146, + "kl": 0.05074925348162651, + "learning_rate": 9.414737964294634e-07, + "loss": 0.0005, + "num_tokens": 18653218.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7520911693572998, + "sampling/importance_sampling_ratio/mean": 1.0004611015319824, + "sampling/importance_sampling_ratio/min": 0.09077741950750351, + "sampling/sampling_logp_difference/max": 2.3993446826934814, + "sampling/sampling_logp_difference/mean": 0.016986709088087082, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 343.40625, + "completions/mean_terminated_length": 343.40625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.36447951197624207, + "epoch": 0.7242647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02865242764091762, + "kl": 0.03275573253631592, + "learning_rate": 9.411389066437507e-07, + "loss": 0.0003, + "num_tokens": 18696428.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5326827764511108, + "sampling/importance_sampling_ratio/mean": 1.0002455711364746, + "sampling/importance_sampling_ratio/min": 0.6872919201850891, + "sampling/sampling_logp_difference/max": 0.4270195960998535, + "sampling/sampling_logp_difference/mean": 0.012679225765168667, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 274.859375, + "completions/mean_terminated_length": 274.859375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.38698798418045044, + "epoch": 0.7254901960784313, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.900943564636299, + "kl": 0.05070265009999275, + "learning_rate": 9.408031213740044e-07, + "loss": -0.0006, + "num_tokens": 18730931.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6438260078430176, + "sampling/importance_sampling_ratio/mean": 0.9993126392364502, + "sampling/importance_sampling_ratio/min": 0.689705491065979, + "sampling/sampling_logp_difference/max": 0.4970264434814453, + "sampling/sampling_logp_difference/mean": 0.013545414432883263, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 358.34375, + "completions/mean_terminated_length": 358.34375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.36991560459136963, + "epoch": 0.7267156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5301147887989228, + "kl": 0.04598201811313629, + "learning_rate": 9.404664413018476e-07, + "loss": -0.0099, + "num_tokens": 18775785.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7139068841934204, + "sampling/importance_sampling_ratio/mean": 1.0006451606750488, + "sampling/importance_sampling_ratio/min": 0.4161494970321655, + "sampling/sampling_logp_difference/max": 0.8767106533050537, + "sampling/sampling_logp_difference/mean": 0.014390267431735992, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 311.6875, + "completions/mean_terminated_length": 311.6875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.4156648516654968, + "epoch": 0.7279411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5723504706838156, + "kl": 0.03343087434768677, + "learning_rate": 9.401288671107193e-07, + "loss": -0.0006, + "num_tokens": 18816101.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6876107454299927, + "sampling/importance_sampling_ratio/mean": 1.0000860691070557, + "sampling/importance_sampling_ratio/min": 0.6515225768089294, + "sampling/sampling_logp_difference/max": 0.5233137607574463, + "sampling/sampling_logp_difference/mean": 0.0153332045301795, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 342.5625, + "completions/mean_terminated_length": 342.5625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.40205198526382446, + "epoch": 0.7291666666666666, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7865444424660837, + "kl": 0.03984655812382698, + "learning_rate": 9.397903994858735e-07, + "loss": 0.0438, + "num_tokens": 18857817.0, + "reward": 0.21875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4517771005630493, + "sampling/importance_sampling_ratio/mean": 0.9999061822891235, + "sampling/importance_sampling_ratio/min": 0.6904515624046326, + "sampling/sampling_logp_difference/max": 0.3727884292602539, + "sampling/sampling_logp_difference/mean": 0.012401677668094635, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 280.765625, + "completions/mean_terminated_length": 280.765625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.36197447776794434, + "epoch": 0.7303921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040535233546695075, + "kl": 0.049834877252578735, + "learning_rate": 9.394510391143786e-07, + "loss": 0.0004, + "num_tokens": 18891242.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.42715585231781, + "sampling/importance_sampling_ratio/mean": 0.9997456073760986, + "sampling/importance_sampling_ratio/min": 0.6623404026031494, + "sampling/sampling_logp_difference/max": 0.411975622177124, + "sampling/sampling_logp_difference/mean": 0.013757757842540741, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 399.515625, + "completions/mean_terminated_length": 399.515625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.34008845686912537, + "epoch": 0.7316176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025416439014180003, + "kl": 0.030293263494968414, + "learning_rate": 9.391107866851142e-07, + "loss": 0.0003, + "num_tokens": 18949787.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6054352521896362, + "sampling/importance_sampling_ratio/mean": 1.0004926919937134, + "sampling/importance_sampling_ratio/min": 0.5919367074966431, + "sampling/sampling_logp_difference/max": 0.5243555307388306, + "sampling/sampling_logp_difference/mean": 0.012024283409118652, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 234.0625, + "completions/mean_terminated_length": 234.0625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4019208550453186, + "epoch": 0.7328431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04831611945362319, + "kl": 0.057452306151390076, + "learning_rate": 9.387696428887715e-07, + "loss": 0.0005, + "num_tokens": 18979167.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6169447898864746, + "sampling/importance_sampling_ratio/mean": 1.0006022453308105, + "sampling/importance_sampling_ratio/min": 0.5576868057250977, + "sampling/sampling_logp_difference/max": 0.5839577913284302, + "sampling/sampling_logp_difference/mean": 0.0151644516736269, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 221.75, + "completions/mean_terminated_length": 221.75, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3913234770298004, + "epoch": 0.7340686274509803, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8912620717071558, + "kl": 0.07460720092058182, + "learning_rate": 9.384276084178504e-07, + "loss": 0.0116, + "num_tokens": 19007551.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.448309302330017, + "sampling/importance_sampling_ratio/mean": 0.9996531009674072, + "sampling/importance_sampling_ratio/min": 0.6317389011383057, + "sampling/sampling_logp_difference/max": 0.45927906036376953, + "sampling/sampling_logp_difference/mean": 0.01611633598804474, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 258.28125, + "completions/mean_terminated_length": 258.28125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.47644010186195374, + "epoch": 0.7352941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03643086357138928, + "kl": 0.04496239125728607, + "learning_rate": 9.380846839666595e-07, + "loss": 0.0004, + "num_tokens": 19056913.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7568045854568481, + "sampling/importance_sampling_ratio/mean": 1.0003657341003418, + "sampling/importance_sampling_ratio/min": 0.6972380876541138, + "sampling/sampling_logp_difference/max": 0.5634965896606445, + "sampling/sampling_logp_difference/mean": 0.014628026634454727, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 273.390625, + "completions/mean_terminated_length": 273.390625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.3627666234970093, + "epoch": 0.7365196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03865247668415776, + "kl": 0.05126260966062546, + "learning_rate": 9.377408702313136e-07, + "loss": 0.0005, + "num_tokens": 19090426.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4355396032333374, + "sampling/importance_sampling_ratio/mean": 0.9997859001159668, + "sampling/importance_sampling_ratio/min": 0.6249046325683594, + "sampling/sampling_logp_difference/max": 0.470156192779541, + "sampling/sampling_logp_difference/mean": 0.014393947087228298, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 242.046875, + "completions/mean_terminated_length": 242.046875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.3892621397972107, + "epoch": 0.7377450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03507889196843655, + "kl": 0.0562661848962307, + "learning_rate": 9.37396167909733e-07, + "loss": 0.0005, + "num_tokens": 19124669.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7792092561721802, + "sampling/importance_sampling_ratio/mean": 0.9994032979011536, + "sampling/importance_sampling_ratio/min": 0.5018308758735657, + "sampling/sampling_logp_difference/max": 0.6894922256469727, + "sampling/sampling_logp_difference/mean": 0.01530286855995655, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 262.828125, + "completions/mean_terminated_length": 262.828125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.372028648853302, + "epoch": 0.7389705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051174383054545663, + "kl": 0.048788998275995255, + "learning_rate": 9.370505777016413e-07, + "loss": 0.0005, + "num_tokens": 19157810.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5744249820709229, + "sampling/importance_sampling_ratio/mean": 1.0003635883331299, + "sampling/importance_sampling_ratio/min": 0.7011212706565857, + "sampling/sampling_logp_difference/max": 0.4538900852203369, + "sampling/sampling_logp_difference/mean": 0.014637460000813007, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 245.4375, + "completions/mean_terminated_length": 245.4375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.40267762541770935, + "epoch": 0.7401960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03602906711181945, + "kl": 0.0445004478096962, + "learning_rate": 9.367041003085648e-07, + "loss": 0.0004, + "num_tokens": 19192510.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6365485191345215, + "sampling/importance_sampling_ratio/mean": 1.0004836320877075, + "sampling/importance_sampling_ratio/min": 0.6401402354240417, + "sampling/sampling_logp_difference/max": 0.49258947372436523, + "sampling/sampling_logp_difference/mean": 0.016224956139922142, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 296.890625, + "completions/mean_terminated_length": 296.890625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.4386707544326782, + "epoch": 0.741421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04674979673877471, + "kl": 0.03908385708928108, + "learning_rate": 9.363567364338307e-07, + "loss": 0.0004, + "num_tokens": 19233335.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3713226318359375, + "sampling/importance_sampling_ratio/mean": 0.9998978972434998, + "sampling/importance_sampling_ratio/min": 0.695604681968689, + "sampling/sampling_logp_difference/max": 0.362973690032959, + "sampling/sampling_logp_difference/mean": 0.014243248850107193, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 266.96875, + "completions/mean_terminated_length": 266.96875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.3504762351512909, + "epoch": 0.7426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6962367993223767, + "kl": 0.054517414420843124, + "learning_rate": 9.360084867825658e-07, + "loss": 0.013, + "num_tokens": 19270261.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6438707113265991, + "sampling/importance_sampling_ratio/mean": 1.000388264656067, + "sampling/importance_sampling_ratio/min": 0.6260436773300171, + "sampling/sampling_logp_difference/max": 0.4970536231994629, + "sampling/sampling_logp_difference/mean": 0.01413656771183014, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 246.703125, + "completions/mean_terminated_length": 246.703125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.39139968156814575, + "epoch": 0.7438725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029854538741684488, + "kl": 0.045577920973300934, + "learning_rate": 9.356593520616946e-07, + "loss": 0.0004, + "num_tokens": 19315426.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2952314615249634, + "sampling/importance_sampling_ratio/mean": 1.0002150535583496, + "sampling/importance_sampling_ratio/min": 0.698577344417572, + "sampling/sampling_logp_difference/max": 0.35870933532714844, + "sampling/sampling_logp_difference/mean": 0.013229596428573132, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 246.078125, + "completions/mean_terminated_length": 246.078125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.4169134795665741, + "epoch": 0.7450980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7135715337463644, + "kl": 0.049882128834724426, + "learning_rate": 9.353093329799386e-07, + "loss": 0.0057, + "num_tokens": 19345783.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4834805727005005, + "sampling/importance_sampling_ratio/mean": 0.9999332427978516, + "sampling/importance_sampling_ratio/min": 0.6186323165893555, + "sampling/sampling_logp_difference/max": 0.48024415969848633, + "sampling/sampling_logp_difference/mean": 0.015446479432284832, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 236.28125, + "completions/mean_terminated_length": 236.28125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3663399815559387, + "epoch": 0.7463235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7660803450669008, + "kl": 0.04421684145927429, + "learning_rate": 9.349584302478144e-07, + "loss": 0.0135, + "num_tokens": 19380681.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.3435853719711304, + "sampling/importance_sampling_ratio/mean": 0.9998900294303894, + "sampling/importance_sampling_ratio/min": 0.6743672490119934, + "sampling/sampling_logp_difference/max": 0.3939805030822754, + "sampling/sampling_logp_difference/mean": 0.013786327093839645, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 244.421875, + "completions/mean_terminated_length": 244.421875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.3699806034564972, + "epoch": 0.7475490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028794080921610312, + "kl": 0.032179079949855804, + "learning_rate": 9.346066445776321e-07, + "loss": 0.0003, + "num_tokens": 19414468.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5622196197509766, + "sampling/importance_sampling_ratio/mean": 0.9998641014099121, + "sampling/importance_sampling_ratio/min": 0.6900743246078491, + "sampling/sampling_logp_difference/max": 0.4461076259613037, + "sampling/sampling_logp_difference/mean": 0.012894706800580025, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 252.9375, + "completions/mean_terminated_length": 252.9375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.3648497760295868, + "epoch": 0.7487745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03219960872218517, + "kl": 0.04307214915752411, + "learning_rate": 9.342539766834945e-07, + "loss": 0.0004, + "num_tokens": 19447536.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5771461725234985, + "sampling/importance_sampling_ratio/mean": 1.0005178451538086, + "sampling/importance_sampling_ratio/min": 0.6526509523391724, + "sampling/sampling_logp_difference/max": 0.45561695098876953, + "sampling/sampling_logp_difference/mean": 0.013854766264557838, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 288.46875, + "completions/mean_terminated_length": 288.46875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.3499077558517456, + "epoch": 0.75, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02198641352607768, + "kl": 0.029847459867596626, + "learning_rate": 9.339004272812949e-07, + "loss": 0.0003, + "num_tokens": 19485646.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8180161714553833, + "sampling/importance_sampling_ratio/mean": 0.9999338388442993, + "sampling/importance_sampling_ratio/min": 0.4954237639904022, + "sampling/sampling_logp_difference/max": 0.7023417949676514, + "sampling/sampling_logp_difference/mean": 0.012782221660017967, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 187.453125, + "completions/mean_terminated_length": 187.453125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.40029793977737427, + "epoch": 0.7512254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04016545177690515, + "kl": 0.04481827840209007, + "learning_rate": 9.335459970887165e-07, + "loss": 0.0004, + "num_tokens": 19514507.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.563207983970642, + "sampling/importance_sampling_ratio/mean": 1.000853180885315, + "sampling/importance_sampling_ratio/min": 0.625471830368042, + "sampling/sampling_logp_difference/max": 0.46924901008605957, + "sampling/sampling_logp_difference/mean": 0.0157247856259346, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 244.90625, + "completions/mean_terminated_length": 244.90625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.38803213834762573, + "epoch": 0.7524509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02340689613733759, + "kl": 0.03086831234395504, + "learning_rate": 9.331906868252299e-07, + "loss": 0.0003, + "num_tokens": 19551413.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3239327669143677, + "sampling/importance_sampling_ratio/mean": 0.9999281764030457, + "sampling/importance_sampling_ratio/min": 0.6368730068206787, + "sampling/sampling_logp_difference/max": 0.4511849880218506, + "sampling/sampling_logp_difference/mean": 0.014674468897283077, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 239.265625, + "completions/mean_terminated_length": 239.265625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.39600038528442383, + "epoch": 0.7536764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028293929103437802, + "kl": 0.03476922959089279, + "learning_rate": 9.328344972120925e-07, + "loss": 0.0003, + "num_tokens": 19587126.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5374728441238403, + "sampling/importance_sampling_ratio/mean": 1.000159502029419, + "sampling/importance_sampling_ratio/min": 0.6482195258140564, + "sampling/sampling_logp_difference/max": 0.43352580070495605, + "sampling/sampling_logp_difference/mean": 0.014290915802121162, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 217.46875, + "completions/mean_terminated_length": 217.46875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.41973811388015747, + "epoch": 0.7549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9031670760037284, + "kl": 0.042274974286556244, + "learning_rate": 9.324774289723467e-07, + "loss": -0.0065, + "num_tokens": 19621748.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4025861024856567, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.6208490133285522, + "sampling/sampling_logp_difference/max": 0.4766674041748047, + "sampling/sampling_logp_difference/mean": 0.0144076282158494, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 243.609375, + "completions/mean_terminated_length": 243.609375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.31403636932373047, + "epoch": 0.7561274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5682684804508696, + "kl": 0.02878103218972683, + "learning_rate": 9.321194828308183e-07, + "loss": -0.0059, + "num_tokens": 19651995.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3645097017288208, + "sampling/importance_sampling_ratio/mean": 0.9994995594024658, + "sampling/importance_sampling_ratio/min": 0.62545245885849, + "sampling/sampling_logp_difference/max": 0.4692800045013428, + "sampling/sampling_logp_difference/mean": 0.012476968578994274, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 216.34375, + "completions/mean_terminated_length": 216.34375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4400501251220703, + "epoch": 0.7573529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7444790506242778, + "kl": 0.03834313154220581, + "learning_rate": 9.317606595141155e-07, + "loss": 0.0073, + "num_tokens": 19683073.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5071570873260498, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.6173469424247742, + "sampling/sampling_logp_difference/max": 0.48232412338256836, + "sampling/sampling_logp_difference/mean": 0.017789751291275024, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 218.328125, + "completions/mean_terminated_length": 218.328125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3517623543739319, + "epoch": 0.758578431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9659575956221583, + "kl": 0.03380831331014633, + "learning_rate": 9.314009597506265e-07, + "loss": 0.0099, + "num_tokens": 19710998.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6260311603546143, + "sampling/importance_sampling_ratio/mean": 1.0004644393920898, + "sampling/importance_sampling_ratio/min": 0.6954984068870544, + "sampling/sampling_logp_difference/max": 0.4861421585083008, + "sampling/sampling_logp_difference/mean": 0.014174041338264942, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 203.1875, + "completions/mean_terminated_length": 203.1875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.41839948296546936, + "epoch": 0.7598039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03355588255284209, + "kl": 0.038350485265254974, + "learning_rate": 9.310403842705194e-07, + "loss": 0.0004, + "num_tokens": 19740242.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3223214149475098, + "sampling/importance_sampling_ratio/mean": 0.9994974136352539, + "sampling/importance_sampling_ratio/min": 0.626263439655304, + "sampling/sampling_logp_difference/max": 0.4679841995239258, + "sampling/sampling_logp_difference/mean": 0.016148468479514122, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 253.9375, + "completions/mean_terminated_length": 253.9375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.4144206941127777, + "epoch": 0.7610294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02585294348524462, + "kl": 0.03346798196434975, + "learning_rate": 9.306789338057393e-07, + "loss": 0.0003, + "num_tokens": 19777390.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4269086122512817, + "sampling/importance_sampling_ratio/mean": 0.9993743896484375, + "sampling/importance_sampling_ratio/min": 0.6089751124382019, + "sampling/sampling_logp_difference/max": 0.49597787857055664, + "sampling/sampling_logp_difference/mean": 0.01556416880339384, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 205.484375, + "completions/mean_terminated_length": 205.484375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.35567742586135864, + "epoch": 0.7622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02446956236989713, + "kl": 0.030868055298924446, + "learning_rate": 9.303166090900081e-07, + "loss": 0.0003, + "num_tokens": 19805069.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.482351303100586, + "sampling/importance_sampling_ratio/mean": 1.0001112222671509, + "sampling/importance_sampling_ratio/min": 0.6232286691665649, + "sampling/sampling_logp_difference/max": 0.472841739654541, + "sampling/sampling_logp_difference/mean": 0.015565713867545128, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 179.296875, + "completions/mean_terminated_length": 179.296875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.37179112434387207, + "epoch": 0.7634803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026836675117688488, + "kl": 0.03199257701635361, + "learning_rate": 9.299534108588217e-07, + "loss": 0.0003, + "num_tokens": 19834064.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2945696115493774, + "sampling/importance_sampling_ratio/mean": 1.0003085136413574, + "sampling/importance_sampling_ratio/min": 0.7167701721191406, + "sampling/sampling_logp_difference/max": 0.3330000638961792, + "sampling/sampling_logp_difference/mean": 0.015221469104290009, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 193.78125, + "completions/mean_terminated_length": 193.78125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3535550832748413, + "epoch": 0.7647058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021615991914011708, + "kl": 0.026434477418661118, + "learning_rate": 9.295893398494497e-07, + "loss": 0.0003, + "num_tokens": 19865090.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7394191026687622, + "sampling/importance_sampling_ratio/mean": 1.0003294944763184, + "sampling/importance_sampling_ratio/min": 0.381239652633667, + "sampling/sampling_logp_difference/max": 0.9643270969390869, + "sampling/sampling_logp_difference/mean": 0.014835287816822529, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 204.671875, + "completions/mean_terminated_length": 204.671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3836597204208374, + "epoch": 0.7659313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1315714816878374, + "kl": 0.052863769233226776, + "learning_rate": 9.29224396800933e-07, + "loss": 0.0004, + "num_tokens": 19894317.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.589585542678833, + "sampling/importance_sampling_ratio/mean": 1.0002679824829102, + "sampling/importance_sampling_ratio/min": 0.4982438087463379, + "sampling/sampling_logp_difference/max": 0.6966657638549805, + "sampling/sampling_logp_difference/mean": 0.01588052697479725, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 184.109375, + "completions/mean_terminated_length": 184.109375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.3631596863269806, + "epoch": 0.7671568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024232601747706098, + "kl": 0.030641362071037292, + "learning_rate": 9.288585824540832e-07, + "loss": 0.0003, + "num_tokens": 19925364.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3705593347549438, + "sampling/importance_sampling_ratio/mean": 0.9995449185371399, + "sampling/importance_sampling_ratio/min": 0.6803103685379028, + "sampling/sampling_logp_difference/max": 0.38520610332489014, + "sampling/sampling_logp_difference/mean": 0.016184469684958458, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 139.046875, + "completions/mean_terminated_length": 139.046875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3215046525001526, + "epoch": 0.7683823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031171899339627742, + "kl": 0.03496295213699341, + "learning_rate": 9.284918975514797e-07, + "loss": 0.0003, + "num_tokens": 19948759.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3640942573547363, + "sampling/importance_sampling_ratio/mean": 1.000285029411316, + "sampling/importance_sampling_ratio/min": 0.6998546123504639, + "sampling/sampling_logp_difference/max": 0.3568826913833618, + "sampling/sampling_logp_difference/mean": 0.014979102648794651, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 209.171875, + "completions/mean_terminated_length": 209.171875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3832654654979706, + "epoch": 0.7696078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028730570751950475, + "kl": 0.03670923411846161, + "learning_rate": 9.281243428374701e-07, + "loss": 0.0004, + "num_tokens": 19975922.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5638998746871948, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.6141809225082397, + "sampling/sampling_logp_difference/max": 0.4874657988548279, + "sampling/sampling_logp_difference/mean": 0.015809854492545128, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 207.734375, + "completions/mean_terminated_length": 207.734375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.34388265013694763, + "epoch": 0.7708333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02739606455882557, + "kl": 0.02548704668879509, + "learning_rate": 9.277559190581669e-07, + "loss": 0.0002, + "num_tokens": 20013633.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5587172508239746, + "sampling/importance_sampling_ratio/mean": 1.0006846189498901, + "sampling/importance_sampling_ratio/min": 0.6293648481369019, + "sampling/sampling_logp_difference/max": 0.4630441665649414, + "sampling/sampling_logp_difference/mean": 0.014479327946901321, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 153.078125, + "completions/mean_terminated_length": 153.078125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.38949495553970337, + "epoch": 0.7720588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02700349170585698, + "kl": 0.0368114598095417, + "learning_rate": 9.273866269614473e-07, + "loss": 0.0004, + "num_tokens": 20038246.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.358526349067688, + "sampling/importance_sampling_ratio/mean": 0.9988189935684204, + "sampling/importance_sampling_ratio/min": 0.6289469003677368, + "sampling/sampling_logp_difference/max": 0.46370840072631836, + "sampling/sampling_logp_difference/mean": 0.01636747270822525, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 179.71875, + "completions/mean_terminated_length": 179.71875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4201076030731201, + "epoch": 0.7732843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029579892735116912, + "kl": 0.033645324409008026, + "learning_rate": 9.270164672969507e-07, + "loss": 0.0003, + "num_tokens": 20062724.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999145269393921, + "sampling/importance_sampling_ratio/min": 0.643266499042511, + "sampling/sampling_logp_difference/max": 0.7331724166870117, + "sampling/sampling_logp_difference/mean": 0.016295362263917923, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 173.03125, + "completions/mean_terminated_length": 173.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4026562571525574, + "epoch": 0.7745098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026486712340398812, + "kl": 0.0320625826716423, + "learning_rate": 9.266454408160777e-07, + "loss": 0.0003, + "num_tokens": 20090422.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5595238208770752, + "sampling/importance_sampling_ratio/mean": 1.0004431009292603, + "sampling/importance_sampling_ratio/min": 0.6771566867828369, + "sampling/sampling_logp_difference/max": 0.444380521774292, + "sampling/sampling_logp_difference/mean": 0.016752826049923897, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 150.65625, + "completions/mean_terminated_length": 150.65625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.2996982932090759, + "epoch": 0.7757352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03653067439866391, + "kl": 0.036317117512226105, + "learning_rate": 9.262735482719887e-07, + "loss": 0.0004, + "num_tokens": 20113648.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5977691411972046, + "sampling/importance_sampling_ratio/mean": 1.0001448392868042, + "sampling/importance_sampling_ratio/min": 0.6234261989593506, + "sampling/sampling_logp_difference/max": 0.47252488136291504, + "sampling/sampling_logp_difference/mean": 0.015517426654696465, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 199.0, + "completions/mean_terminated_length": 199.0, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3743230998516083, + "epoch": 0.7769607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027387618909479224, + "kl": 0.03293408080935478, + "learning_rate": 9.259007904196021e-07, + "loss": 0.0003, + "num_tokens": 20144928.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4939587116241455, + "sampling/importance_sampling_ratio/mean": 1.000123143196106, + "sampling/importance_sampling_ratio/min": 0.6464174389839172, + "sampling/sampling_logp_difference/max": 0.436309814453125, + "sampling/sampling_logp_difference/mean": 0.016843587160110474, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 226.59375, + "completions/mean_terminated_length": 226.59375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.4369206428527832, + "epoch": 0.7781862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8060044236768866, + "kl": 0.03182988241314888, + "learning_rate": 9.255271680155923e-07, + "loss": 0.0117, + "num_tokens": 20178966.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.445049524307251, + "sampling/importance_sampling_ratio/mean": 1.0002126693725586, + "sampling/importance_sampling_ratio/min": 0.6056285500526428, + "sampling/sampling_logp_difference/max": 0.501488447189331, + "sampling/sampling_logp_difference/mean": 0.016117552295327187, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 202.328125, + "completions/mean_terminated_length": 202.328125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.339857280254364, + "epoch": 0.7794117647058824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02036401823812101, + "kl": 0.021871455013751984, + "learning_rate": 9.251526818183896e-07, + "loss": 0.0002, + "num_tokens": 20213131.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5074704885482788, + "sampling/importance_sampling_ratio/mean": 1.0000642538070679, + "sampling/importance_sampling_ratio/min": 0.6117793917655945, + "sampling/sampling_logp_difference/max": 0.49138355255126953, + "sampling/sampling_logp_difference/mean": 0.014778957702219486, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 181.734375, + "completions/mean_terminated_length": 181.734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.31528449058532715, + "epoch": 0.7806372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02285719484122413, + "kl": 0.026205450296401978, + "learning_rate": 9.247773325881769e-07, + "loss": 0.0003, + "num_tokens": 20240298.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5207531452178955, + "sampling/importance_sampling_ratio/mean": 1.0005592107772827, + "sampling/importance_sampling_ratio/min": 0.6264617443084717, + "sampling/sampling_logp_difference/max": 0.4676675796508789, + "sampling/sampling_logp_difference/mean": 0.013754029758274555, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 189.734375, + "completions/mean_terminated_length": 189.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3886377811431885, + "epoch": 0.7818627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8941321714645836, + "kl": 0.03759963810443878, + "learning_rate": 9.244011210868895e-07, + "loss": -0.012, + "num_tokens": 20270441.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6039882898330688, + "sampling/importance_sampling_ratio/mean": 1.0004351139068604, + "sampling/importance_sampling_ratio/min": 0.5586837530136108, + "sampling/sampling_logp_difference/max": 0.5821716785430908, + "sampling/sampling_logp_difference/mean": 0.016681428998708725, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 199.15625, + "completions/mean_terminated_length": 199.15625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.30207303166389465, + "epoch": 0.7830882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02134380872013604, + "kl": 0.02383604645729065, + "learning_rate": 9.240240480782129e-07, + "loss": 0.0002, + "num_tokens": 20299667.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8481186628341675, + "sampling/importance_sampling_ratio/mean": 0.9998188614845276, + "sampling/importance_sampling_ratio/min": 0.5087823867797852, + "sampling/sampling_logp_difference/max": 0.6757348775863647, + "sampling/sampling_logp_difference/mean": 0.014480200596153736, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 194.59375, + "completions/mean_terminated_length": 194.59375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3990108370780945, + "epoch": 0.7843137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026674649992018927, + "kl": 0.03336282819509506, + "learning_rate": 9.236461143275815e-07, + "loss": 0.0003, + "num_tokens": 20332089.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6365165710449219, + "sampling/importance_sampling_ratio/mean": 0.9999001622200012, + "sampling/importance_sampling_ratio/min": 0.5185440182685852, + "sampling/sampling_logp_difference/max": 0.6567304134368896, + "sampling/sampling_logp_difference/mean": 0.01609973981976509, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 184.828125, + "completions/mean_terminated_length": 184.828125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.37371888756752014, + "epoch": 0.7855392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9130812228838805, + "kl": 0.030252164229750633, + "learning_rate": 9.232673206021767e-07, + "loss": -0.0069, + "num_tokens": 20358302.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.471134066581726, + "sampling/importance_sampling_ratio/mean": 0.9999719262123108, + "sampling/importance_sampling_ratio/min": 0.60378497838974, + "sampling/sampling_logp_difference/max": 0.5045371055603027, + "sampling/sampling_logp_difference/mean": 0.01553965825587511, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 184.4375, + "completions/mean_terminated_length": 184.4375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3092377483844757, + "epoch": 0.7867647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0526300537508844, + "kl": 0.028306782245635986, + "learning_rate": 9.228876676709259e-07, + "loss": -0.0171, + "num_tokens": 20385738.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.44679594039917, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 0.6616688370704651, + "sampling/sampling_logp_difference/max": 0.41299009323120117, + "sampling/sampling_logp_difference/mean": 0.01314954087138176, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 208.625, + "completions/mean_terminated_length": 208.625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3816605806350708, + "epoch": 0.7879901960784313, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0318468385581494, + "kl": 0.021000273525714874, + "learning_rate": 9.225071563045006e-07, + "loss": -0.0395, + "num_tokens": 20414258.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5868381261825562, + "sampling/importance_sampling_ratio/mean": 0.9999417066574097, + "sampling/importance_sampling_ratio/min": 0.620795726776123, + "sampling/sampling_logp_difference/max": 0.47675323486328125, + "sampling/sampling_logp_difference/mean": 0.015572399832308292, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 172.328125, + "completions/mean_terminated_length": 172.328125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3673475384712219, + "epoch": 0.7892156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02231174943566149, + "kl": 0.026067199185490608, + "learning_rate": 9.221257872753144e-07, + "loss": 0.0003, + "num_tokens": 20442167.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4737712144851685, + "sampling/importance_sampling_ratio/mean": 0.9999111890792847, + "sampling/importance_sampling_ratio/min": 0.6994224190711975, + "sampling/sampling_logp_difference/max": 0.38782453536987305, + "sampling/sampling_logp_difference/mean": 0.015629781410098076, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 142.546875, + "completions/mean_terminated_length": 142.546875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3521348536014557, + "epoch": 0.7904411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9623338309282699, + "kl": 0.03117131069302559, + "learning_rate": 9.217435613575226e-07, + "loss": 0.0133, + "num_tokens": 20466522.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6464656591415405, + "sampling/importance_sampling_ratio/mean": 0.9999923706054688, + "sampling/importance_sampling_ratio/min": 0.6097986102104187, + "sampling/sampling_logp_difference/max": 0.49863100051879883, + "sampling/sampling_logp_difference/mean": 0.01695406809449196, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 165.6875, + "completions/mean_terminated_length": 165.6875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3095352053642273, + "epoch": 0.7916666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02055465809870928, + "kl": 0.027063611894845963, + "learning_rate": 9.213604793270196e-07, + "loss": 0.0003, + "num_tokens": 20491782.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.550699234008789, + "sampling/importance_sampling_ratio/mean": 0.9997790455818176, + "sampling/importance_sampling_ratio/min": 0.6032088994979858, + "sampling/sampling_logp_difference/max": 0.5054917335510254, + "sampling/sampling_logp_difference/mean": 0.014584427699446678, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 204.4375, + "completions/mean_terminated_length": 204.4375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3655603528022766, + "epoch": 0.7928921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9800069770800625, + "kl": 0.02848714590072632, + "learning_rate": 9.209765419614373e-07, + "loss": 0.0286, + "num_tokens": 20519778.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5460152626037598, + "sampling/importance_sampling_ratio/mean": 0.999433159828186, + "sampling/importance_sampling_ratio/min": 0.6922650933265686, + "sampling/sampling_logp_difference/max": 0.4356808662414551, + "sampling/sampling_logp_difference/mean": 0.015107221901416779, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 190.421875, + "completions/mean_terminated_length": 190.421875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3832641541957855, + "epoch": 0.7941176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01899884698364976, + "kl": 0.02495635487139225, + "learning_rate": 9.205917500401447e-07, + "loss": 0.0002, + "num_tokens": 20550573.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3351675271987915, + "sampling/importance_sampling_ratio/mean": 0.9992302060127258, + "sampling/importance_sampling_ratio/min": 0.5054485201835632, + "sampling/sampling_logp_difference/max": 0.6823091506958008, + "sampling/sampling_logp_difference/mean": 0.014901909045875072, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 176.875, + "completions/mean_terminated_length": 176.875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.39816907048225403, + "epoch": 0.7953431372549019, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.928661629774423, + "kl": 0.02907564491033554, + "learning_rate": 9.202061043442447e-07, + "loss": 0.006, + "num_tokens": 20577269.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5745267868041992, + "sampling/importance_sampling_ratio/mean": 1.0006103515625, + "sampling/importance_sampling_ratio/min": 0.6622360348701477, + "sampling/sampling_logp_difference/max": 0.45395469665527344, + "sampling/sampling_logp_difference/mean": 0.01689119264483452, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 195.828125, + "completions/mean_terminated_length": 195.828125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.43053948879241943, + "epoch": 0.7965686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02537971558990522, + "kl": 0.02950497530400753, + "learning_rate": 9.198196056565738e-07, + "loss": 0.0003, + "num_tokens": 20608714.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.575938105583191, + "sampling/importance_sampling_ratio/mean": 0.9997596740722656, + "sampling/importance_sampling_ratio/min": 0.6518038511276245, + "sampling/sampling_logp_difference/max": 0.4548506736755371, + "sampling/sampling_logp_difference/mean": 0.017002377659082413, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 224.90625, + "completions/mean_terminated_length": 224.90625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3836837410926819, + "epoch": 0.7977941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8304670957527277, + "kl": 0.0255807526409626, + "learning_rate": 9.194322547616997e-07, + "loss": 0.0046, + "num_tokens": 20641076.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4579864740371704, + "sampling/importance_sampling_ratio/mean": 0.999410092830658, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.4826626777648926, + "sampling/sampling_logp_difference/mean": 0.014957036823034286, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 218.265625, + "completions/mean_terminated_length": 218.265625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.42147013545036316, + "epoch": 0.7990196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0434532236433227, + "kl": 0.034579478204250336, + "learning_rate": 9.190440524459202e-07, + "loss": 0.0209, + "num_tokens": 20675813.0, + "reward": 0.625, + "reward_std": 0.481805682182312, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.9799221754074097, + "sampling/importance_sampling_ratio/mean": 1.0001271963119507, + "sampling/importance_sampling_ratio/min": 0.5352949500083923, + "sampling/sampling_logp_difference/max": 0.6830575466156006, + "sampling/sampling_logp_difference/mean": 0.015538867563009262, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 242.40625, + "completions/mean_terminated_length": 242.40625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3450837731361389, + "epoch": 0.8002450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03538963112994367, + "kl": 0.02714606374502182, + "learning_rate": 9.186549994972616e-07, + "loss": 0.0003, + "num_tokens": 20710559.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.507158875465393, + "sampling/importance_sampling_ratio/mean": 0.9997929334640503, + "sampling/importance_sampling_ratio/min": 0.49667325615882874, + "sampling/sampling_logp_difference/max": 0.6998229026794434, + "sampling/sampling_logp_difference/mean": 0.013135567307472229, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 210.640625, + "completions/mean_terminated_length": 210.640625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.41749826073646545, + "epoch": 0.8014705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1118625188874587, + "kl": 0.0379713773727417, + "learning_rate": 9.182650967054766e-07, + "loss": 0.0278, + "num_tokens": 20743672.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6087944507598877, + "sampling/importance_sampling_ratio/mean": 0.9991387128829956, + "sampling/importance_sampling_ratio/min": 0.633405327796936, + "sampling/sampling_logp_difference/max": 0.47548508644104004, + "sampling/sampling_logp_difference/mean": 0.015086393803358078, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 228.484375, + "completions/mean_terminated_length": 228.484375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.4391304850578308, + "epoch": 0.8026960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6970682353865376, + "kl": 0.034562766551971436, + "learning_rate": 9.178743448620431e-07, + "loss": -0.0008, + "num_tokens": 20775783.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6267740726470947, + "sampling/importance_sampling_ratio/mean": 1.0002880096435547, + "sampling/importance_sampling_ratio/min": 0.6578302383422852, + "sampling/sampling_logp_difference/max": 0.4865989685058594, + "sampling/sampling_logp_difference/mean": 0.016575045883655548, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 223.140625, + "completions/mean_terminated_length": 223.140625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.3668602705001831, + "epoch": 0.803921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7347430934219253, + "kl": 0.025888238102197647, + "learning_rate": 9.174827447601627e-07, + "loss": -0.0145, + "num_tokens": 20806064.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.620834231376648, + "sampling/importance_sampling_ratio/mean": 0.9997379183769226, + "sampling/importance_sampling_ratio/min": 0.6337375044822693, + "sampling/sampling_logp_difference/max": 0.4829409122467041, + "sampling/sampling_logp_difference/mean": 0.013868054375052452, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 275.75, + "completions/mean_terminated_length": 275.75, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.37275058031082153, + "epoch": 0.8051470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1062569662147859, + "kl": 0.02562744729220867, + "learning_rate": 9.170902971947588e-07, + "loss": 0.0227, + "num_tokens": 20841632.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6554718017578125, + "sampling/importance_sampling_ratio/mean": 1.0002436637878418, + "sampling/importance_sampling_ratio/min": 0.6547819972038269, + "sampling/sampling_logp_difference/max": 0.5040860176086426, + "sampling/sampling_logp_difference/mean": 0.01474856398999691, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 178.78125, + "completions/mean_terminated_length": 178.78125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.30432000756263733, + "epoch": 0.8063725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02551123762861861, + "kl": 0.02973772957921028, + "learning_rate": 9.166970029624749e-07, + "loss": 0.0003, + "num_tokens": 20867026.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.465565800666809, + "sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.7103471755981445, + "sampling/sampling_logp_difference/max": 0.3822413682937622, + "sampling/sampling_logp_difference/mean": 0.01424277201294899, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 192.328125, + "completions/mean_terminated_length": 192.328125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.30770403146743774, + "epoch": 0.8075980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02144686647787804, + "kl": 0.026862647384405136, + "learning_rate": 9.163028628616738e-07, + "loss": 0.0003, + "num_tokens": 20897511.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.420772910118103, + "sampling/importance_sampling_ratio/mean": 0.9997912645339966, + "sampling/importance_sampling_ratio/min": 0.6944699287414551, + "sampling/sampling_logp_difference/max": 0.3646063804626465, + "sampling/sampling_logp_difference/mean": 0.012521122582256794, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 233.71875, + "completions/mean_terminated_length": 233.71875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4509712755680084, + "epoch": 0.8088235294117647, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.276355707768248, + "kl": 0.0320962592959404, + "learning_rate": 9.159078776924345e-07, + "loss": -0.0953, + "num_tokens": 20930277.0, + "reward": -0.28125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.624848484992981, + "sampling/importance_sampling_ratio/mean": 1.0003355741500854, + "sampling/importance_sampling_ratio/min": 0.612565815448761, + "sampling/sampling_logp_difference/max": 0.4900989532470703, + "sampling/sampling_logp_difference/mean": 0.01768365502357483, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 246.140625, + "completions/mean_terminated_length": 246.140625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.31037211418151855, + "epoch": 0.8100490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.665667870226732, + "kl": 0.02321113646030426, + "learning_rate": 9.155120482565519e-07, + "loss": 0.0108, + "num_tokens": 20963422.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3408809900283813, + "sampling/importance_sampling_ratio/mean": 1.000077247619629, + "sampling/importance_sampling_ratio/min": 0.7097097039222717, + "sampling/sampling_logp_difference/max": 0.3428993225097656, + "sampling/sampling_logp_difference/mean": 0.011591589078307152, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 217.015625, + "completions/mean_terminated_length": 217.015625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3376811742782593, + "epoch": 0.8112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027645075247131553, + "kl": 0.03416694700717926, + "learning_rate": 9.15115375357535e-07, + "loss": 0.0003, + "num_tokens": 20992127.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6521793603897095, + "sampling/importance_sampling_ratio/mean": 0.9999262690544128, + "sampling/importance_sampling_ratio/min": 0.6254516839981079, + "sampling/sampling_logp_difference/max": 0.5020952224731445, + "sampling/sampling_logp_difference/mean": 0.014627622440457344, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 229.90625, + "completions/mean_terminated_length": 229.90625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.3859865069389343, + "epoch": 0.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034558263862194155, + "kl": 0.04741872847080231, + "learning_rate": 9.147178598006044e-07, + "loss": 0.0004, + "num_tokens": 21022841.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003008842468262, + "sampling/importance_sampling_ratio/min": 0.4793369174003601, + "sampling/sampling_logp_difference/max": 0.7353515625, + "sampling/sampling_logp_difference/mean": 0.014258160255849361, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 211.359375, + "completions/mean_terminated_length": 211.359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.42058265209198, + "epoch": 0.8137254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7761319641351087, + "kl": 0.042282477021217346, + "learning_rate": 9.143195023926917e-07, + "loss": -0.0085, + "num_tokens": 21050736.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.327823281288147, + "sampling/importance_sampling_ratio/mean": 0.9997660517692566, + "sampling/importance_sampling_ratio/min": 0.49541500210762024, + "sampling/sampling_logp_difference/max": 0.7023594379425049, + "sampling/sampling_logp_difference/mean": 0.016381915658712387, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 304.0, + "completions/mean_terminated_length": 304.0, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.4263555407524109, + "epoch": 0.8149509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.68782348424507, + "kl": 0.02612464874982834, + "learning_rate": 9.139203039424368e-07, + "loss": 0.009, + "num_tokens": 21087472.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.374288558959961, + "sampling/importance_sampling_ratio/mean": 1.0000532865524292, + "sampling/importance_sampling_ratio/min": 0.5362951159477234, + "sampling/sampling_logp_difference/max": 0.6230707168579102, + "sampling/sampling_logp_difference/mean": 0.01435888186097145, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 210.984375, + "completions/mean_terminated_length": 210.984375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.45306310057640076, + "epoch": 0.8161764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7615476265988077, + "kl": 0.04164658114314079, + "learning_rate": 9.135202652601876e-07, + "loss": -0.0096, + "num_tokens": 21116751.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6147634983062744, + "sampling/importance_sampling_ratio/mean": 0.9993770122528076, + "sampling/importance_sampling_ratio/min": 0.6319296360015869, + "sampling/sampling_logp_difference/max": 0.4791884422302246, + "sampling/sampling_logp_difference/mean": 0.017668476328253746, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 269.671875, + "completions/mean_terminated_length": 269.671875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3904714584350586, + "epoch": 0.8174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7452772038267703, + "kl": 0.028026167303323746, + "learning_rate": 9.131193871579974e-07, + "loss": -0.0206, + "num_tokens": 21161850.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9828075170516968, + "sampling/importance_sampling_ratio/mean": 1.0001330375671387, + "sampling/importance_sampling_ratio/min": 0.55345618724823, + "sampling/sampling_logp_difference/max": 0.6845138072967529, + "sampling/sampling_logp_difference/mean": 0.015064573846757412, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.40476080775260925, + "epoch": 0.8186274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032451223748506355, + "kl": 0.04005538672208786, + "learning_rate": 9.127176704496231e-07, + "loss": 0.0004, + "num_tokens": 21196226.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5751492977142334, + "sampling/importance_sampling_ratio/mean": 1.0002100467681885, + "sampling/importance_sampling_ratio/min": 0.70793217420578, + "sampling/sampling_logp_difference/max": 0.45434999465942383, + "sampling/sampling_logp_difference/mean": 0.016093598678708076, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 242.609375, + "completions/mean_terminated_length": 242.609375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.41240131855010986, + "epoch": 0.8198529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0744136939825115, + "kl": 0.0493934229016304, + "learning_rate": 9.123151159505241e-07, + "loss": 0.0095, + "num_tokens": 21225465.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5759365558624268, + "sampling/importance_sampling_ratio/mean": 0.9999208450317383, + "sampling/importance_sampling_ratio/min": 0.7214313745498657, + "sampling/sampling_logp_difference/max": 0.4548497200012207, + "sampling/sampling_logp_difference/mean": 0.015428552404046059, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 215.765625, + "completions/mean_terminated_length": 215.765625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3737298250198364, + "epoch": 0.821078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032785380224256894, + "kl": 0.0349334180355072, + "learning_rate": 9.119117244778607e-07, + "loss": 0.0004, + "num_tokens": 21259098.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 1.0001604557037354, + "sampling/importance_sampling_ratio/min": 0.5914656519889832, + "sampling/sampling_logp_difference/max": 0.5251517295837402, + "sampling/sampling_logp_difference/mean": 0.014463303610682487, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 208.890625, + "completions/mean_terminated_length": 208.890625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3668181896209717, + "epoch": 0.8223039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030875322481958026, + "kl": 0.03284909576177597, + "learning_rate": 9.115074968504921e-07, + "loss": 0.0003, + "num_tokens": 21294099.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4577935934066772, + "sampling/importance_sampling_ratio/mean": 1.0002777576446533, + "sampling/importance_sampling_ratio/min": 0.6196385025978088, + "sampling/sampling_logp_difference/max": 0.4786190986633301, + "sampling/sampling_logp_difference/mean": 0.014896114356815815, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 208.765625, + "completions/mean_terminated_length": 208.765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.33424144983291626, + "epoch": 0.8235294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024422238394924658, + "kl": 0.025998327881097794, + "learning_rate": 9.111024338889746e-07, + "loss": 0.0003, + "num_tokens": 21321604.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6071261167526245, + "sampling/importance_sampling_ratio/mean": 0.9993447065353394, + "sampling/importance_sampling_ratio/min": 0.6622374653816223, + "sampling/sampling_logp_difference/max": 0.47444748878479004, + "sampling/sampling_logp_difference/mean": 0.014403936453163624, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 189.578125, + "completions/mean_terminated_length": 189.578125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3311287760734558, + "epoch": 0.8247549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02488256089413651, + "kl": 0.027755137532949448, + "learning_rate": 9.106965364155605e-07, + "loss": 0.0003, + "num_tokens": 21353049.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6572978496551514, + "sampling/importance_sampling_ratio/mean": 0.9994744062423706, + "sampling/importance_sampling_ratio/min": 0.6177636384963989, + "sampling/sampling_logp_difference/max": 0.5051884651184082, + "sampling/sampling_logp_difference/mean": 0.014555609785020351, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 190.515625, + "completions/mean_terminated_length": 190.515625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.30811795592308044, + "epoch": 0.8259803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03174957866503794, + "kl": 0.03279242664575577, + "learning_rate": 9.102898052541957e-07, + "loss": 0.0003, + "num_tokens": 21385930.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4735252857208252, + "sampling/importance_sampling_ratio/mean": 1.0001640319824219, + "sampling/importance_sampling_ratio/min": 0.5007209181785583, + "sampling/sampling_logp_difference/max": 0.6917064189910889, + "sampling/sampling_logp_difference/mean": 0.014363166876137257, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 200.921875, + "completions/mean_terminated_length": 200.921875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.25038421154022217, + "epoch": 0.8272058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02248365752820782, + "kl": 0.025730226188898087, + "learning_rate": 9.09882241230519e-07, + "loss": 0.0002, + "num_tokens": 21414149.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6639342308044434, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.6247349381446838, + "sampling/sampling_logp_difference/max": 0.5091848373413086, + "sampling/sampling_logp_difference/mean": 0.012042131274938583, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 238.90625, + "completions/mean_terminated_length": 238.90625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.31127092242240906, + "epoch": 0.8284313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024899186794190512, + "kl": 0.024371564388275146, + "learning_rate": 9.094738451718593e-07, + "loss": 0.0002, + "num_tokens": 21445503.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5279085636138916, + "sampling/importance_sampling_ratio/mean": 1.0001455545425415, + "sampling/importance_sampling_ratio/min": 0.6630603075027466, + "sampling/sampling_logp_difference/max": 0.42389988899230957, + "sampling/sampling_logp_difference/mean": 0.01330435648560524, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 190.328125, + "completions/mean_terminated_length": 190.328125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.31943953037261963, + "epoch": 0.8296568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0860904211449993, + "kl": 0.04268050193786621, + "learning_rate": 9.09064617907235e-07, + "loss": 0.0004, + "num_tokens": 21471812.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6025196313858032, + "sampling/importance_sampling_ratio/mean": 0.9998210072517395, + "sampling/importance_sampling_ratio/min": 0.6037432551383972, + "sampling/sampling_logp_difference/max": 0.5046062469482422, + "sampling/sampling_logp_difference/mean": 0.01423419825732708, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 186.328125, + "completions/mean_terminated_length": 186.328125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2529739737510681, + "epoch": 0.8308823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0462886497662702, + "kl": 0.03442566841840744, + "learning_rate": 9.086545602673513e-07, + "loss": 0.0003, + "num_tokens": 21498521.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5877935886383057, + "sampling/importance_sampling_ratio/mean": 0.999727725982666, + "sampling/importance_sampling_ratio/min": 0.6185637712478638, + "sampling/sampling_logp_difference/max": 0.48035502433776855, + "sampling/sampling_logp_difference/mean": 0.012091949582099915, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 246.328125, + "completions/mean_terminated_length": 246.328125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.3578718602657318, + "epoch": 0.8321078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02742302173376564, + "kl": 0.03098582848906517, + "learning_rate": 9.082436730845993e-07, + "loss": 0.0003, + "num_tokens": 21532062.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4351998567581177, + "sampling/importance_sampling_ratio/mean": 0.9995269775390625, + "sampling/importance_sampling_ratio/min": 0.5791468024253845, + "sampling/sampling_logp_difference/max": 0.5461993217468262, + "sampling/sampling_logp_difference/mean": 0.01504305936396122, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 222.203125, + "completions/mean_terminated_length": 222.203125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.329847127199173, + "epoch": 0.8333333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022430099774986383, + "kl": 0.02566959336400032, + "learning_rate": 9.07831957193054e-07, + "loss": 0.0002, + "num_tokens": 21567995.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.547278642654419, + "sampling/importance_sampling_ratio/mean": 0.9995587468147278, + "sampling/importance_sampling_ratio/min": 0.6056252121925354, + "sampling/sampling_logp_difference/max": 0.5014939308166504, + "sampling/sampling_logp_difference/mean": 0.013780951499938965, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 184.859375, + "completions/mean_terminated_length": 184.859375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.36886081099510193, + "epoch": 0.8345588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030786434624787917, + "kl": 0.02804858796298504, + "learning_rate": 9.074194134284725e-07, + "loss": 0.0003, + "num_tokens": 21598322.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4174898862838745, + "sampling/importance_sampling_ratio/mean": 0.9999595880508423, + "sampling/importance_sampling_ratio/min": 0.6053468585014343, + "sampling/sampling_logp_difference/max": 0.5019536018371582, + "sampling/sampling_logp_difference/mean": 0.016476290300488472, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 222.90625, + "completions/mean_terminated_length": 222.90625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.40505316853523254, + "epoch": 0.8357843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9681077769908948, + "kl": 0.03483361750841141, + "learning_rate": 9.070060426282924e-07, + "loss": 0.0068, + "num_tokens": 21631676.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.464301347732544, + "sampling/importance_sampling_ratio/mean": 0.9999438524246216, + "sampling/importance_sampling_ratio/min": 0.6095887422561646, + "sampling/sampling_logp_difference/max": 0.49497079849243164, + "sampling/sampling_logp_difference/mean": 0.0161592997610569, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 218.234375, + "completions/mean_terminated_length": 218.234375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3744795322418213, + "epoch": 0.8370098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027247538606273457, + "kl": 0.02873339131474495, + "learning_rate": 9.065918456316303e-07, + "loss": 0.0003, + "num_tokens": 21660699.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5562453269958496, + "sampling/importance_sampling_ratio/mean": 0.9999843835830688, + "sampling/importance_sampling_ratio/min": 0.6183509826660156, + "sampling/sampling_logp_difference/max": 0.4806990623474121, + "sampling/sampling_logp_difference/mean": 0.015885494649410248, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 225.359375, + "completions/mean_terminated_length": 225.359375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.31745287775993347, + "epoch": 0.8382352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01958112477160249, + "kl": 0.024302463978528976, + "learning_rate": 9.061768232792802e-07, + "loss": 0.0002, + "num_tokens": 21700386.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.608885407447815, + "sampling/importance_sampling_ratio/mean": 0.9997677803039551, + "sampling/importance_sampling_ratio/min": 0.49139276146888733, + "sampling/sampling_logp_difference/max": 0.7105115652084351, + "sampling/sampling_logp_difference/mean": 0.013085578568279743, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 212.0, + "completions/mean_terminated_length": 212.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3961857855319977, + "epoch": 0.8394607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8216536330980322, + "kl": 0.036479175090789795, + "learning_rate": 9.057609764137109e-07, + "loss": 0.0295, + "num_tokens": 21733874.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5302457809448242, + "sampling/importance_sampling_ratio/mean": 0.9998958110809326, + "sampling/importance_sampling_ratio/min": 0.6393280625343323, + "sampling/sampling_logp_difference/max": 0.44733762741088867, + "sampling/sampling_logp_difference/mean": 0.015596205368638039, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 238.765625, + "completions/mean_terminated_length": 238.765625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.4261338710784912, + "epoch": 0.8406862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8447611894538987, + "kl": 0.03494609519839287, + "learning_rate": 9.053443058790651e-07, + "loss": 0.0277, + "num_tokens": 21768451.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3671236038208008, + "sampling/importance_sampling_ratio/mean": 0.9995797872543335, + "sampling/importance_sampling_ratio/min": 0.6059609055519104, + "sampling/sampling_logp_difference/max": 0.5009398460388184, + "sampling/sampling_logp_difference/mean": 0.016068704426288605, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 208.75, + "completions/mean_terminated_length": 208.75, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4047449827194214, + "epoch": 0.8419117647058824, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8108837008270493, + "kl": 0.03250086307525635, + "learning_rate": 9.049268125211575e-07, + "loss": 0.003, + "num_tokens": 21799091.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5083317756652832, + "sampling/importance_sampling_ratio/mean": 0.9995611906051636, + "sampling/importance_sampling_ratio/min": 0.6117785573005676, + "sampling/sampling_logp_difference/max": 0.49138498306274414, + "sampling/sampling_logp_difference/mean": 0.015563691966235638, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 162.953125, + "completions/mean_terminated_length": 162.953125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2634103298187256, + "epoch": 0.8431372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03276185420745991, + "kl": 0.034213632345199585, + "learning_rate": 9.045084971874737e-07, + "loss": 0.0003, + "num_tokens": 21826848.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.654000997543335, + "sampling/importance_sampling_ratio/mean": 1.000110149383545, + "sampling/importance_sampling_ratio/min": 0.5676656365394592, + "sampling/sampling_logp_difference/max": 0.5662226676940918, + "sampling/sampling_logp_difference/mean": 0.013691332191228867, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 181.28125, + "completions/mean_terminated_length": 181.28125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3478059768676758, + "epoch": 0.8443627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029273529644389087, + "kl": 0.03276558965444565, + "learning_rate": 9.040893607271668e-07, + "loss": 0.0003, + "num_tokens": 21863346.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6288442611694336, + "sampling/importance_sampling_ratio/mean": 1.0005204677581787, + "sampling/importance_sampling_ratio/min": 0.6104415655136108, + "sampling/sampling_logp_difference/max": 0.4935727119445801, + "sampling/sampling_logp_difference/mean": 0.015371044166386127, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 197.90625, + "completions/mean_terminated_length": 197.90625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3709610402584076, + "epoch": 0.8455882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023464229306227536, + "kl": 0.030765000730752945, + "learning_rate": 9.036694039910576e-07, + "loss": 0.0003, + "num_tokens": 21891996.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.613331913948059, + "sampling/importance_sampling_ratio/mean": 0.9996424913406372, + "sampling/importance_sampling_ratio/min": 0.6932787895202637, + "sampling/sampling_logp_difference/max": 0.4783015251159668, + "sampling/sampling_logp_difference/mean": 0.01533258706331253, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 196.171875, + "completions/mean_terminated_length": 196.171875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.38342106342315674, + "epoch": 0.8468137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03526812413441555, + "kl": 0.03387913852930069, + "learning_rate": 9.032486278316313e-07, + "loss": 0.0003, + "num_tokens": 21922263.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.413758635520935, + "sampling/importance_sampling_ratio/mean": 0.9994491934776306, + "sampling/importance_sampling_ratio/min": 0.6389612555503845, + "sampling/sampling_logp_difference/max": 0.44791150093078613, + "sampling/sampling_logp_difference/mean": 0.015761565417051315, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 192.84375, + "completions/mean_terminated_length": 192.84375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3467373251914978, + "epoch": 0.8480392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03485957554369687, + "kl": 0.03310055658221245, + "learning_rate": 9.028270331030372e-07, + "loss": 0.0003, + "num_tokens": 21952701.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 0.9998960494995117, + "sampling/importance_sampling_ratio/min": 0.6819144487380981, + "sampling/sampling_logp_difference/max": 0.4682176113128662, + "sampling/sampling_logp_difference/mean": 0.014420264400541782, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 167.390625, + "completions/mean_terminated_length": 167.390625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3890190124511719, + "epoch": 0.8492647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041594942453069, + "kl": 0.03580023720860481, + "learning_rate": 9.024046206610857e-07, + "loss": 0.0004, + "num_tokens": 21982326.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4016826152801514, + "sampling/importance_sampling_ratio/mean": 0.9994091987609863, + "sampling/importance_sampling_ratio/min": 0.650191068649292, + "sampling/sampling_logp_difference/max": 0.43048906326293945, + "sampling/sampling_logp_difference/mean": 0.016755372285842896, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 191.515625, + "completions/mean_terminated_length": 191.515625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3270426392555237, + "epoch": 0.8504901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022579528594023877, + "kl": 0.03228536248207092, + "learning_rate": 9.019813913632475e-07, + "loss": 0.0003, + "num_tokens": 22012471.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4279792308807373, + "sampling/importance_sampling_ratio/mean": 0.9996547698974609, + "sampling/importance_sampling_ratio/min": 0.7065871953964233, + "sampling/sampling_logp_difference/max": 0.3562602996826172, + "sampling/sampling_logp_difference/mean": 0.015847191214561462, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 142.71875, + "completions/mean_terminated_length": 142.71875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.40136900544166565, + "epoch": 0.8517156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03651747452103777, + "kl": 0.044040828943252563, + "learning_rate": 9.015573460686509e-07, + "loss": 0.0004, + "num_tokens": 22038149.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7007126808166504, + "sampling/importance_sampling_ratio/mean": 1.0002636909484863, + "sampling/importance_sampling_ratio/min": 0.5775176286697388, + "sampling/sampling_logp_difference/max": 0.5490162372589111, + "sampling/sampling_logp_difference/mean": 0.01701734960079193, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 151.203125, + "completions/mean_terminated_length": 151.203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.39290058612823486, + "epoch": 0.8529411764705882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04166512395127129, + "kl": 0.04310157150030136, + "learning_rate": 9.011324856380813e-07, + "loss": 0.0004, + "num_tokens": 22064770.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6670409440994263, + "sampling/importance_sampling_ratio/mean": 0.9995830059051514, + "sampling/importance_sampling_ratio/min": 0.6389588117599487, + "sampling/sampling_logp_difference/max": 0.5110502243041992, + "sampling/sampling_logp_difference/mean": 0.01720668561756611, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 209.71875, + "completions/mean_terminated_length": 209.71875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4242228865623474, + "epoch": 0.8541666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021517881330219764, + "kl": 0.02575260028243065, + "learning_rate": 9.007068109339783e-07, + "loss": 0.0003, + "num_tokens": 22097184.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4351998567581177, + "sampling/importance_sampling_ratio/mean": 0.9997817277908325, + "sampling/importance_sampling_ratio/min": 0.6130290627479553, + "sampling/sampling_logp_difference/max": 0.48934292793273926, + "sampling/sampling_logp_difference/mean": 0.015529869124293327, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 172.390625, + "completions/mean_terminated_length": 172.390625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3806571960449219, + "epoch": 0.8553921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.049945500053586875, + "kl": 0.03458796441555023, + "learning_rate": 9.002803228204348e-07, + "loss": 0.0003, + "num_tokens": 22127657.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.649056315422058, + "sampling/importance_sampling_ratio/mean": 0.9996928572654724, + "sampling/importance_sampling_ratio/min": 0.6011093854904175, + "sampling/sampling_logp_difference/max": 0.5089783668518066, + "sampling/sampling_logp_difference/mean": 0.015567264519631863, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 192.421875, + "completions/mean_terminated_length": 192.421875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4347988963127136, + "epoch": 0.8566176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030727108984442423, + "kl": 0.0344550758600235, + "learning_rate": 8.998530221631941e-07, + "loss": 0.0003, + "num_tokens": 22160020.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3363691568374634, + "sampling/importance_sampling_ratio/mean": 0.9999052286148071, + "sampling/importance_sampling_ratio/min": 0.670637309551239, + "sampling/sampling_logp_difference/max": 0.39952683448791504, + "sampling/sampling_logp_difference/mean": 0.0158841609954834, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 219.109375, + "completions/mean_terminated_length": 219.109375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4007618725299835, + "epoch": 0.8578431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01855108143989355, + "kl": 0.023340201005339622, + "learning_rate": 8.994249098296502e-07, + "loss": 0.0002, + "num_tokens": 22191739.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4741108417510986, + "sampling/importance_sampling_ratio/mean": 1.0001301765441895, + "sampling/importance_sampling_ratio/min": 0.6716850996017456, + "sampling/sampling_logp_difference/max": 0.397965669631958, + "sampling/sampling_logp_difference/mean": 0.014817805960774422, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 163.5625, + "completions/mean_terminated_length": 163.5625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.43091028928756714, + "epoch": 0.8590686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03130822030377899, + "kl": 0.036232396960258484, + "learning_rate": 8.989959866888437e-07, + "loss": 0.0003, + "num_tokens": 22221871.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.785539984703064, + "sampling/importance_sampling_ratio/mean": 1.0007494688034058, + "sampling/importance_sampling_ratio/min": 0.6233208179473877, + "sampling/sampling_logp_difference/max": 0.5797208547592163, + "sampling/sampling_logp_difference/mean": 0.017153877764940262, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 154.9375, + "completions/mean_terminated_length": 154.9375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3444056510925293, + "epoch": 0.8602941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043817633280303046, + "kl": 0.040040526539087296, + "learning_rate": 8.985662536114612e-07, + "loss": 0.0004, + "num_tokens": 22251307.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5191256999969482, + "sampling/importance_sampling_ratio/mean": 0.9996485710144043, + "sampling/importance_sampling_ratio/min": 0.6330731511116028, + "sampling/sampling_logp_difference/max": 0.4571692943572998, + "sampling/sampling_logp_difference/mean": 0.01527867466211319, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 179.421875, + "completions/mean_terminated_length": 179.421875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.283399760723114, + "epoch": 0.8615196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022086809477837064, + "kl": 0.0240059532225132, + "learning_rate": 8.981357114698338e-07, + "loss": 0.0002, + "num_tokens": 22284678.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6264933347702026, + "sampling/importance_sampling_ratio/mean": 0.9999646544456482, + "sampling/importance_sampling_ratio/min": 0.662260890007019, + "sampling/sampling_logp_difference/max": 0.48642635345458984, + "sampling/sampling_logp_difference/mean": 0.012372169643640518, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 186.6875, + "completions/mean_terminated_length": 186.6875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4073829650878906, + "epoch": 0.8627450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021700756893971683, + "kl": 0.0328010693192482, + "learning_rate": 8.977043611379349e-07, + "loss": 0.0003, + "num_tokens": 22313810.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.577315330505371, + "sampling/importance_sampling_ratio/mean": 1.000518560409546, + "sampling/importance_sampling_ratio/min": 0.6396447420120239, + "sampling/sampling_logp_difference/max": 0.45572423934936523, + "sampling/sampling_logp_difference/mean": 0.0156413447111845, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 201.984375, + "completions/mean_terminated_length": 201.984375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.36387914419174194, + "epoch": 0.8639705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021705187236821282, + "kl": 0.026145074516534805, + "learning_rate": 8.972722034913781e-07, + "loss": 0.0003, + "num_tokens": 22350369.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4937084913253784, + "sampling/importance_sampling_ratio/mean": 0.9994643926620483, + "sampling/importance_sampling_ratio/min": 0.7072100043296814, + "sampling/sampling_logp_difference/max": 0.40126192569732666, + "sampling/sampling_logp_difference/mean": 0.015600104816257954, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 170.53125, + "completions/mean_terminated_length": 170.53125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4234261214733124, + "epoch": 0.8651960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020820011210816534, + "kl": 0.029060494154691696, + "learning_rate": 8.968392394074163e-07, + "loss": 0.0003, + "num_tokens": 22377347.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6623014211654663, + "sampling/importance_sampling_ratio/mean": 1.0004706382751465, + "sampling/importance_sampling_ratio/min": 0.6393036246299744, + "sampling/sampling_logp_difference/max": 0.5082030296325684, + "sampling/sampling_logp_difference/mean": 0.01729530841112137, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 184.671875, + "completions/mean_terminated_length": 184.671875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.41309821605682373, + "epoch": 0.866421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028351012774506905, + "kl": 0.03086160495877266, + "learning_rate": 8.964054697649388e-07, + "loss": 0.0003, + "num_tokens": 22408254.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.408115267753601, + "sampling/importance_sampling_ratio/mean": 1.0000227689743042, + "sampling/importance_sampling_ratio/min": 0.6796722412109375, + "sampling/sampling_logp_difference/max": 0.38614463806152344, + "sampling/sampling_logp_difference/mean": 0.016858208924531937, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 194.375, + "completions/mean_terminated_length": 194.375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.33535701036453247, + "epoch": 0.8676470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01831578375235787, + "kl": 0.02631985768675804, + "learning_rate": 8.959708954444708e-07, + "loss": 0.0003, + "num_tokens": 22434134.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.618519902229309, + "sampling/importance_sampling_ratio/mean": 1.0003108978271484, + "sampling/importance_sampling_ratio/min": 0.6791917681694031, + "sampling/sampling_logp_difference/max": 0.48151206970214844, + "sampling/sampling_logp_difference/mean": 0.014257533475756645, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 172.34375, + "completions/mean_terminated_length": 172.34375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4500412344932556, + "epoch": 0.8688725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021215947051883795, + "kl": 0.027528800070285797, + "learning_rate": 8.955355173281707e-07, + "loss": 0.0003, + "num_tokens": 22460412.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3008592128753662, + "sampling/importance_sampling_ratio/mean": 1.000032901763916, + "sampling/importance_sampling_ratio/min": 0.6282380819320679, + "sampling/sampling_logp_difference/max": 0.46483612060546875, + "sampling/sampling_logp_difference/mean": 0.019712721928954124, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 184.609375, + "completions/mean_terminated_length": 184.609375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3844994902610779, + "epoch": 0.8700980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01740447594554791, + "kl": 0.02519618347287178, + "learning_rate": 8.95099336299828e-07, + "loss": 0.0002, + "num_tokens": 22491043.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5583192110061646, + "sampling/importance_sampling_ratio/mean": 1.000016450881958, + "sampling/importance_sampling_ratio/min": 0.5772902369499207, + "sampling/sampling_logp_difference/max": 0.5494101643562317, + "sampling/sampling_logp_difference/mean": 0.015510272234678268, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 200.609375, + "completions/mean_terminated_length": 200.609375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.39858540892601013, + "epoch": 0.8713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040457987331105866, + "kl": 0.04456058144569397, + "learning_rate": 8.946623532448631e-07, + "loss": 0.0004, + "num_tokens": 22522330.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.495909333229065, + "sampling/importance_sampling_ratio/mean": 1.000077486038208, + "sampling/importance_sampling_ratio/min": 0.6124499440193176, + "sampling/sampling_logp_difference/max": 0.49028801918029785, + "sampling/sampling_logp_difference/mean": 0.016100822016596794, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 188.9375, + "completions/mean_terminated_length": 188.9375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3326869606971741, + "epoch": 0.8725490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01630484328654411, + "kl": 0.024245694279670715, + "learning_rate": 8.942245690503238e-07, + "loss": 0.0002, + "num_tokens": 22550214.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5851231813430786, + "sampling/importance_sampling_ratio/mean": 1.0000001192092896, + "sampling/importance_sampling_ratio/min": 0.6805460453033447, + "sampling/sampling_logp_difference/max": 0.4606621265411377, + "sampling/sampling_logp_difference/mean": 0.013762826099991798, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 193.09375, + "completions/mean_terminated_length": 193.09375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.31201836466789246, + "epoch": 0.8737745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016822303110914344, + "kl": 0.02444467693567276, + "learning_rate": 8.937859846048842e-07, + "loss": 0.0002, + "num_tokens": 22579228.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.637949824333191, + "sampling/importance_sampling_ratio/mean": 1.0003360509872437, + "sampling/importance_sampling_ratio/min": 0.635092556476593, + "sampling/sampling_logp_difference/max": 0.49344539642333984, + "sampling/sampling_logp_difference/mean": 0.012438100762665272, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 231.6875, + "completions/mean_terminated_length": 231.6875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4103919267654419, + "epoch": 0.875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6983983579036075, + "kl": 0.026166250929236412, + "learning_rate": 8.933466007988429e-07, + "loss": -0.0295, + "num_tokens": 22610808.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.318074107170105, + "sampling/importance_sampling_ratio/mean": 0.9994531273841858, + "sampling/importance_sampling_ratio/min": 0.6622469425201416, + "sampling/sampling_logp_difference/max": 0.41211676597595215, + "sampling/sampling_logp_difference/mean": 0.014455530792474747, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 148.421875, + "completions/mean_terminated_length": 148.421875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.33341044187545776, + "epoch": 0.8762254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02867520429805719, + "kl": 0.03824207931756973, + "learning_rate": 8.929064185241212e-07, + "loss": 0.0003, + "num_tokens": 22632435.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5465291738510132, + "sampling/importance_sampling_ratio/mean": 0.9997183084487915, + "sampling/importance_sampling_ratio/min": 0.7638641595840454, + "sampling/sampling_logp_difference/max": 0.43601322174072266, + "sampling/sampling_logp_difference/mean": 0.014888597652316093, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 182.21875, + "completions/mean_terminated_length": 182.21875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3266327381134033, + "epoch": 0.8774509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018687077576912053, + "kl": 0.02686445415019989, + "learning_rate": 8.924654386742611e-07, + "loss": 0.0002, + "num_tokens": 22660817.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6115307807922363, + "sampling/importance_sampling_ratio/mean": 0.999614953994751, + "sampling/importance_sampling_ratio/min": 0.682086169719696, + "sampling/sampling_logp_difference/max": 0.477184534072876, + "sampling/sampling_logp_difference/mean": 0.013707519508898258, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 173.8125, + "completions/mean_terminated_length": 173.8125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.36124658584594727, + "epoch": 0.8786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01949862419535672, + "kl": 0.03010931983590126, + "learning_rate": 8.920236621444242e-07, + "loss": 0.0003, + "num_tokens": 22689605.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4058588743209839, + "sampling/importance_sampling_ratio/mean": 0.9999063014984131, + "sampling/importance_sampling_ratio/min": 0.6182796359062195, + "sampling/sampling_logp_difference/max": 0.48081445693969727, + "sampling/sampling_logp_difference/mean": 0.014477972872555256, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 179.0625, + "completions/mean_terminated_length": 179.0625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4190867245197296, + "epoch": 0.8799019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022864716090772563, + "kl": 0.02433510683476925, + "learning_rate": 8.915810898313884e-07, + "loss": 0.0002, + "num_tokens": 22720873.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3492519855499268, + "sampling/importance_sampling_ratio/mean": 0.9995954632759094, + "sampling/importance_sampling_ratio/min": 0.695496678352356, + "sampling/sampling_logp_difference/max": 0.36312901973724365, + "sampling/sampling_logp_difference/mean": 0.016453402116894722, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 215.375, + "completions/mean_terminated_length": 215.375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4043603539466858, + "epoch": 0.8811274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02135247048031107, + "kl": 0.024310391396284103, + "learning_rate": 8.911377226335478e-07, + "loss": 0.0002, + "num_tokens": 22756881.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3832709789276123, + "sampling/importance_sampling_ratio/mean": 1.0000874996185303, + "sampling/importance_sampling_ratio/min": 0.7845126986503601, + "sampling/sampling_logp_difference/max": 0.3244509696960449, + "sampling/sampling_logp_difference/mean": 0.014155912213027477, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 210.96875, + "completions/mean_terminated_length": 210.96875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3904402256011963, + "epoch": 0.8823529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8236928016307357, + "kl": 0.031219035387039185, + "learning_rate": 8.906935614509095e-07, + "loss": -0.0014, + "num_tokens": 22786095.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4844071865081787, + "sampling/importance_sampling_ratio/mean": 1.0000147819519043, + "sampling/importance_sampling_ratio/min": 0.5703468918800354, + "sampling/sampling_logp_difference/max": 0.5615105628967285, + "sampling/sampling_logp_difference/mean": 0.015508387237787247, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 203.671875, + "completions/mean_terminated_length": 203.671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.5759321451187134, + "epoch": 0.883578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027769155470801124, + "kl": 0.03423618897795677, + "learning_rate": 8.902486071850926e-07, + "loss": 0.0003, + "num_tokens": 22822042.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5285859107971191, + "sampling/importance_sampling_ratio/mean": 0.999665379524231, + "sampling/importance_sampling_ratio/min": 0.7155121564865112, + "sampling/sampling_logp_difference/max": 0.4243431091308594, + "sampling/sampling_logp_difference/mean": 0.01961427554488182, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 186.640625, + "completions/mean_terminated_length": 186.640625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4677279591560364, + "epoch": 0.8848039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02152396847738153, + "kl": 0.027463074773550034, + "learning_rate": 8.89802860739326e-07, + "loss": 0.0003, + "num_tokens": 22854691.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2895188331604004, + "sampling/importance_sampling_ratio/mean": 1.0000085830688477, + "sampling/importance_sampling_ratio/min": 0.6262801289558411, + "sampling/sampling_logp_difference/max": 0.4679574966430664, + "sampling/sampling_logp_difference/mean": 0.01694682240486145, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 236.34375, + "completions/mean_terminated_length": 236.34375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3707200884819031, + "epoch": 0.8860294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02839822055715672, + "kl": 0.025810226798057556, + "learning_rate": 8.89356323018447e-07, + "loss": 0.0002, + "num_tokens": 22889017.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5062004327774048, + "sampling/importance_sampling_ratio/mean": 1.0002585649490356, + "sampling/importance_sampling_ratio/min": 0.608048677444458, + "sampling/sampling_logp_difference/max": 0.4975004196166992, + "sampling/sampling_logp_difference/mean": 0.013876675628125668, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 171.015625, + "completions/mean_terminated_length": 171.015625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.43813496828079224, + "epoch": 0.8872549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9518328097802756, + "kl": 0.043780501931905746, + "learning_rate": 8.889089949288986e-07, + "loss": 0.0065, + "num_tokens": 22914074.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6135611534118652, + "sampling/importance_sampling_ratio/mean": 1.0000569820404053, + "sampling/importance_sampling_ratio/min": 0.7246251106262207, + "sampling/sampling_logp_difference/max": 0.47844362258911133, + "sampling/sampling_logp_difference/mean": 0.01806015707552433, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 190.234375, + "completions/mean_terminated_length": 190.234375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.2979552745819092, + "epoch": 0.8884803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021359864558259008, + "kl": 0.025271622464060783, + "learning_rate": 8.884608773787288e-07, + "loss": 0.0002, + "num_tokens": 22939689.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.632039189338684, + "sampling/importance_sampling_ratio/mean": 0.9996598362922668, + "sampling/importance_sampling_ratio/min": 0.6387041211128235, + "sampling/sampling_logp_difference/max": 0.48983025550842285, + "sampling/sampling_logp_difference/mean": 0.013333464972674847, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 201.703125, + "completions/mean_terminated_length": 201.703125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.40966618061065674, + "epoch": 0.8897058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019191291626506117, + "kl": 0.025096643716096878, + "learning_rate": 8.880119712775875e-07, + "loss": 0.0002, + "num_tokens": 22971030.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6651651859283447, + "sampling/importance_sampling_ratio/mean": 0.9998248219490051, + "sampling/importance_sampling_ratio/min": 0.6176443696022034, + "sampling/sampling_logp_difference/max": 0.5099244117736816, + "sampling/sampling_logp_difference/mean": 0.014878641813993454, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 238.546875, + "completions/mean_terminated_length": 238.546875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3977225422859192, + "epoch": 0.8909313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018954677765834885, + "kl": 0.027560057118535042, + "learning_rate": 8.875622775367259e-07, + "loss": 0.0003, + "num_tokens": 23002217.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5309392213821411, + "sampling/importance_sampling_ratio/mean": 1.0005265474319458, + "sampling/importance_sampling_ratio/min": 0.6823685169219971, + "sampling/sampling_logp_difference/max": 0.42588138580322266, + "sampling/sampling_logp_difference/mean": 0.014687771908938885, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 245.65625, + "completions/mean_terminated_length": 245.65625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.4387362003326416, + "epoch": 0.8921568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01755373771492126, + "kl": 0.01948336698114872, + "learning_rate": 8.871117970689937e-07, + "loss": 0.0002, + "num_tokens": 23036963.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5965895652770996, + "sampling/importance_sampling_ratio/mean": 1.0000070333480835, + "sampling/importance_sampling_ratio/min": 0.6271955370903015, + "sampling/sampling_logp_difference/max": 0.46786975860595703, + "sampling/sampling_logp_difference/mean": 0.01525543536990881, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 187.3125, + "completions/mean_terminated_length": 187.3125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2976965308189392, + "epoch": 0.8933823529411765, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01756662120639696, + "kl": 0.022048771381378174, + "learning_rate": 8.866605307888376e-07, + "loss": 0.0002, + "num_tokens": 23065943.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5641213655471802, + "sampling/importance_sampling_ratio/mean": 1.000281572341919, + "sampling/importance_sampling_ratio/min": 0.6271852254867554, + "sampling/sampling_logp_difference/max": 0.46651339530944824, + "sampling/sampling_logp_difference/mean": 0.012067212723195553, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 197.359375, + "completions/mean_terminated_length": 197.359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.373897910118103, + "epoch": 0.8946078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014672301147486449, + "kl": 0.019550330936908722, + "learning_rate": 8.862084796122997e-07, + "loss": 0.0002, + "num_tokens": 23097790.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6513066291809082, + "sampling/importance_sampling_ratio/mean": 1.0000605583190918, + "sampling/importance_sampling_ratio/min": 0.6707313060760498, + "sampling/sampling_logp_difference/max": 0.5015668869018555, + "sampling/sampling_logp_difference/mean": 0.015741368755698204, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 179.703125, + "completions/mean_terminated_length": 179.703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.46061819791793823, + "epoch": 0.8958333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02386264189946869, + "kl": 0.02802673727273941, + "learning_rate": 8.857556444570153e-07, + "loss": 0.0003, + "num_tokens": 23127067.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.405434250831604, + "sampling/importance_sampling_ratio/mean": 0.9999182224273682, + "sampling/importance_sampling_ratio/min": 0.6932373642921448, + "sampling/sampling_logp_difference/max": 0.3663828372955322, + "sampling/sampling_logp_difference/mean": 0.01652100682258606, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 206.84375, + "completions/mean_terminated_length": 206.84375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3515208959579468, + "epoch": 0.8970588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019219332441676965, + "kl": 0.02097495086491108, + "learning_rate": 8.853020262422109e-07, + "loss": 0.0002, + "num_tokens": 23154481.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002868175506592, + "sampling/importance_sampling_ratio/min": 0.6202810406684875, + "sampling/sampling_logp_difference/max": 0.7561249732971191, + "sampling/sampling_logp_difference/mean": 0.014154670760035515, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 219.390625, + "completions/mean_terminated_length": 219.390625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.301144003868103, + "epoch": 0.8982843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8816825728555113, + "kl": 0.02029554918408394, + "learning_rate": 8.84847625888703e-07, + "loss": -0.0061, + "num_tokens": 23189914.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3341046571731567, + "sampling/importance_sampling_ratio/mean": 1.000375509262085, + "sampling/importance_sampling_ratio/min": 0.6811178922653198, + "sampling/sampling_logp_difference/max": 0.3840198516845703, + "sampling/sampling_logp_difference/mean": 0.012259600684046745, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 186.8125, + "completions/mean_terminated_length": 186.8125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3824283480644226, + "epoch": 0.8995098039215687, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0885734340783775, + "kl": 0.027387483045458794, + "learning_rate": 8.843924443188953e-07, + "loss": -0.0531, + "num_tokens": 23221278.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5175509452819824, + "sampling/importance_sampling_ratio/mean": 0.9999219179153442, + "sampling/importance_sampling_ratio/min": 0.6095960140228271, + "sampling/sampling_logp_difference/max": 0.49495887756347656, + "sampling/sampling_logp_difference/mean": 0.014254633337259293, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 214.6875, + "completions/mean_terminated_length": 214.6875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.40970197319984436, + "epoch": 0.9007352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02078595852834981, + "kl": 0.03226561099290848, + "learning_rate": 8.839364824567775e-07, + "loss": 0.0003, + "num_tokens": 23251690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 0.9999431371688843, + "sampling/importance_sampling_ratio/min": 0.6202805638313293, + "sampling/sampling_logp_difference/max": 0.4775834083557129, + "sampling/sampling_logp_difference/mean": 0.016174018383026123, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 260.203125, + "completions/mean_terminated_length": 260.203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4016256630420685, + "epoch": 0.9019607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7728898684931877, + "kl": 0.03287380188703537, + "learning_rate": 8.834797412279235e-07, + "loss": -0.0102, + "num_tokens": 23289127.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.9951763153076172, + "sampling/importance_sampling_ratio/mean": 0.9996899366378784, + "sampling/importance_sampling_ratio/min": 0.42026883363723755, + "sampling/sampling_logp_difference/max": 0.8668606281280518, + "sampling/sampling_logp_difference/mean": 0.014560644514858723, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 150.21875, + "completions/mean_terminated_length": 150.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.29902899265289307, + "epoch": 0.9031862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027955462755797637, + "kl": 0.03348477557301521, + "learning_rate": 8.83022221559489e-07, + "loss": 0.0003, + "num_tokens": 23313061.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6089363098144531, + "sampling/importance_sampling_ratio/mean": 0.9994817972183228, + "sampling/importance_sampling_ratio/min": 0.417529821395874, + "sampling/sampling_logp_difference/max": 0.8733993172645569, + "sampling/sampling_logp_difference/mean": 0.014205368235707283, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 192.84375, + "completions/mean_terminated_length": 192.84375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3875751793384552, + "epoch": 0.9044117647058824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03185755440779379, + "kl": 0.026237674057483673, + "learning_rate": 8.825639243802098e-07, + "loss": 0.0003, + "num_tokens": 23347659.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4153696298599243, + "sampling/importance_sampling_ratio/mean": 0.9998494386672974, + "sampling/importance_sampling_ratio/min": 0.6408570408821106, + "sampling/sampling_logp_difference/max": 0.4449489116668701, + "sampling/sampling_logp_difference/mean": 0.016096360981464386, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 212.75, + "completions/mean_terminated_length": 212.75, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2933084964752197, + "epoch": 0.9056372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023886017411667487, + "kl": 0.028503376990556717, + "learning_rate": 8.821048506204005e-07, + "loss": 0.0003, + "num_tokens": 23375371.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5751630067825317, + "sampling/importance_sampling_ratio/mean": 0.9999306201934814, + "sampling/importance_sampling_ratio/min": 0.610644519329071, + "sampling/sampling_logp_difference/max": 0.4932403564453125, + "sampling/sampling_logp_difference/mean": 0.013522179797291756, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 257.125, + "completions/mean_terminated_length": 257.125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.42164430022239685, + "epoch": 0.9068627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7956254619530746, + "kl": 0.02659602463245392, + "learning_rate": 8.816450012119513e-07, + "loss": -0.0001, + "num_tokens": 23415395.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6588774919509888, + "sampling/importance_sampling_ratio/mean": 1.0001760721206665, + "sampling/importance_sampling_ratio/min": 0.5339854955673218, + "sampling/sampling_logp_difference/max": 0.6273865699768066, + "sampling/sampling_logp_difference/mean": 0.015276629477739334, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 188.8125, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3506987690925598, + "epoch": 0.9080882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026828614541766244, + "kl": 0.024205820634961128, + "learning_rate": 8.811843770883276e-07, + "loss": 0.0002, + "num_tokens": 23445351.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4376978874206543, + "sampling/importance_sampling_ratio/mean": 0.9997704029083252, + "sampling/importance_sampling_ratio/min": 0.705346405506134, + "sampling/sampling_logp_difference/max": 0.36304306983947754, + "sampling/sampling_logp_difference/mean": 0.01479363813996315, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 299.796875, + "completions/mean_terminated_length": 299.796875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.35800766944885254, + "epoch": 0.9093137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.633406713428934, + "kl": 0.02546757273375988, + "learning_rate": 8.807229791845671e-07, + "loss": -0.0173, + "num_tokens": 23484266.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000248670578003, + "sampling/importance_sampling_ratio/min": 0.572018563747406, + "sampling/sampling_logp_difference/max": 1.0540895462036133, + "sampling/sampling_logp_difference/mean": 0.014206867665052414, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 251.515625, + "completions/mean_terminated_length": 251.515625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.29566600918769836, + "epoch": 0.9105392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7766493417323004, + "kl": 0.03525124117732048, + "learning_rate": 8.802608084372785e-07, + "loss": 0.0325, + "num_tokens": 23521643.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4382350444793701, + "sampling/importance_sampling_ratio/mean": 0.9998162984848022, + "sampling/importance_sampling_ratio/min": 0.6290534138679504, + "sampling/sampling_logp_difference/max": 0.46353912353515625, + "sampling/sampling_logp_difference/mean": 0.012025854550302029, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 208.28125, + "completions/mean_terminated_length": 208.28125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.33061474561691284, + "epoch": 0.9117647058823529, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027698715442733026, + "kl": 0.02474607154726982, + "learning_rate": 8.79797865784639e-07, + "loss": 0.0002, + "num_tokens": 23552013.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.527300477027893, + "sampling/importance_sampling_ratio/mean": 0.9997114539146423, + "sampling/importance_sampling_ratio/min": 0.6178549528121948, + "sampling/sampling_logp_difference/max": 0.48150157928466797, + "sampling/sampling_logp_difference/mean": 0.013280758634209633, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 193.859375, + "completions/mean_terminated_length": 193.859375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3518263101577759, + "epoch": 0.9129901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024194303920341127, + "kl": 0.025767751038074493, + "learning_rate": 8.793341521663928e-07, + "loss": 0.0003, + "num_tokens": 23583124.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.593743085861206, + "sampling/importance_sampling_ratio/mean": 0.9996703267097473, + "sampling/importance_sampling_ratio/min": 0.6124204993247986, + "sampling/sampling_logp_difference/max": 0.49033617973327637, + "sampling/sampling_logp_difference/mean": 0.014404002577066422, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 214.96875, + "completions/mean_terminated_length": 214.96875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3150392770767212, + "epoch": 0.9142156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027045237899341455, + "kl": 0.021665578708052635, + "learning_rate": 8.788696685238494e-07, + "loss": 0.0002, + "num_tokens": 23615218.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5703420639038086, + "sampling/importance_sampling_ratio/mean": 0.9998134970664978, + "sampling/importance_sampling_ratio/min": 0.5153923630714417, + "sampling/sampling_logp_difference/max": 0.6628267765045166, + "sampling/sampling_logp_difference/mean": 0.01464426051825285, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 179.203125, + "completions/mean_terminated_length": 179.203125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2582736611366272, + "epoch": 0.9154411764705882, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02714215186518947, + "kl": 0.02816140279173851, + "learning_rate": 8.784044157998809e-07, + "loss": 0.0003, + "num_tokens": 23641231.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5971472263336182, + "sampling/importance_sampling_ratio/mean": 1.0004371404647827, + "sampling/importance_sampling_ratio/min": 0.6366870403289795, + "sampling/sampling_logp_difference/max": 0.4682190418243408, + "sampling/sampling_logp_difference/mean": 0.01274982187896967, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 255.578125, + "completions/mean_terminated_length": 255.578125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4026065468788147, + "epoch": 0.9166666666666666, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024676608713910596, + "kl": 0.03214331716299057, + "learning_rate": 8.779383949389208e-07, + "loss": 0.0003, + "num_tokens": 23676740.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5278124809265137, + "sampling/importance_sampling_ratio/mean": 0.9997199177742004, + "sampling/importance_sampling_ratio/min": 0.6298378109931946, + "sampling/sampling_logp_difference/max": 0.4622929096221924, + "sampling/sampling_logp_difference/mean": 0.015497921034693718, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 154.859375, + "completions/mean_terminated_length": 154.859375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.27072569727897644, + "epoch": 0.9178921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0374075047980671, + "kl": 0.025277458131313324, + "learning_rate": 8.774716068869623e-07, + "loss": 0.0002, + "num_tokens": 23702971.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6152385473251343, + "sampling/importance_sampling_ratio/mean": 0.9997353553771973, + "sampling/importance_sampling_ratio/min": 0.6209545135498047, + "sampling/sampling_logp_difference/max": 0.47948265075683594, + "sampling/sampling_logp_difference/mean": 0.013536088168621063, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 264.421875, + "completions/mean_terminated_length": 264.421875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.4094110131263733, + "epoch": 0.9191176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013892030016520967, + "kl": 0.01472385972738266, + "learning_rate": 8.770040525915553e-07, + "loss": 0.0001, + "num_tokens": 23750198.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000056505203247, + "sampling/importance_sampling_ratio/min": 0.6676850318908691, + "sampling/sampling_logp_difference/max": 0.822901725769043, + "sampling/sampling_logp_difference/mean": 0.014736359938979149, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 212.640625, + "completions/mean_terminated_length": 212.640625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.2965674102306366, + "epoch": 0.9203431372549019, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9046636615662998, + "kl": 0.02484903112053871, + "learning_rate": 8.765357330018055e-07, + "loss": 0.0088, + "num_tokens": 23780591.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5266870260238647, + "sampling/importance_sampling_ratio/mean": 1.0007375478744507, + "sampling/importance_sampling_ratio/min": 0.4810182452201843, + "sampling/sampling_logp_difference/max": 0.7318501472473145, + "sampling/sampling_logp_difference/mean": 0.013828756287693977, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 262.796875, + "completions/mean_terminated_length": 262.796875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.33337122201919556, + "epoch": 0.9215686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02823034514643041, + "kl": 0.021070297807455063, + "learning_rate": 8.760666490683719e-07, + "loss": 0.0002, + "num_tokens": 23814066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5928850173950195, + "sampling/importance_sampling_ratio/mean": 1.0001671314239502, + "sampling/importance_sampling_ratio/min": 0.6269896626472473, + "sampling/sampling_logp_difference/max": 0.4668252468109131, + "sampling/sampling_logp_difference/mean": 0.012249452993273735, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4141361713409424, + "epoch": 0.9227941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03437685367679887, + "kl": 0.03887910395860672, + "learning_rate": 8.755968017434651e-07, + "loss": 0.0004, + "num_tokens": 23842960.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5015095472335815, + "sampling/importance_sampling_ratio/mean": 1.0005022287368774, + "sampling/importance_sampling_ratio/min": 0.6262646317481995, + "sampling/sampling_logp_difference/max": 0.46798229217529297, + "sampling/sampling_logp_difference/mean": 0.018394464626908302, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 214.953125, + "completions/mean_terminated_length": 214.953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3201451897621155, + "epoch": 0.9240196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02504827027221983, + "kl": 0.0242147259414196, + "learning_rate": 8.751261919808457e-07, + "loss": 0.0002, + "num_tokens": 23877293.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9370697736740112, + "sampling/importance_sampling_ratio/mean": 1.000178575515747, + "sampling/importance_sampling_ratio/min": 0.4793161451816559, + "sampling/sampling_logp_difference/max": 0.7353949546813965, + "sampling/sampling_logp_difference/mean": 0.014398392289876938, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 232.921875, + "completions/mean_terminated_length": 232.921875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.2768540382385254, + "epoch": 0.9252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016273019864649428, + "kl": 0.020013771951198578, + "learning_rate": 8.746548207358215e-07, + "loss": 0.0002, + "num_tokens": 23915560.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5176751613616943, + "sampling/importance_sampling_ratio/mean": 1.0000295639038086, + "sampling/importance_sampling_ratio/min": 0.5838847756385803, + "sampling/sampling_logp_difference/max": 0.5380516052246094, + "sampling/sampling_logp_difference/mean": 0.012251168489456177, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 225.953125, + "completions/mean_terminated_length": 225.953125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.40648049116134644, + "epoch": 0.9264705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03658944231433625, + "kl": 0.03250942379236221, + "learning_rate": 8.741826889652463e-07, + "loss": 0.0003, + "num_tokens": 23953125.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6207877397537231, + "sampling/importance_sampling_ratio/mean": 0.9998102784156799, + "sampling/importance_sampling_ratio/min": 0.660457193851471, + "sampling/sampling_logp_difference/max": 0.4829123020172119, + "sampling/sampling_logp_difference/mean": 0.015902938321232796, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 180.171875, + "completions/mean_terminated_length": 180.171875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.31027036905288696, + "epoch": 0.9276960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018178486628944335, + "kl": 0.02138359844684601, + "learning_rate": 8.737097976275176e-07, + "loss": 0.0002, + "num_tokens": 23980368.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4353374242782593, + "sampling/importance_sampling_ratio/mean": 1.0002868175506592, + "sampling/importance_sampling_ratio/min": 0.6369235515594482, + "sampling/sampling_logp_difference/max": 0.45110559463500977, + "sampling/sampling_logp_difference/mean": 0.013565674424171448, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 226.859375, + "completions/mean_terminated_length": 226.859375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3295559883117676, + "epoch": 0.928921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015841653806567234, + "kl": 0.021238170564174652, + "learning_rate": 8.73236147682575e-07, + "loss": 0.0002, + "num_tokens": 24020183.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7261039018630981, + "sampling/importance_sampling_ratio/mean": 1.0000017881393433, + "sampling/importance_sampling_ratio/min": 0.6048555374145508, + "sampling/sampling_logp_difference/max": 0.5458667278289795, + "sampling/sampling_logp_difference/mean": 0.01359205599874258, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 223.1875, + "completions/mean_terminated_length": 223.1875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3196815252304077, + "epoch": 0.9301470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020711053312641645, + "kl": 0.02678016386926174, + "learning_rate": 8.727617400918978e-07, + "loss": 0.0003, + "num_tokens": 24054979.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7527730464935303, + "sampling/importance_sampling_ratio/mean": 1.0000622272491455, + "sampling/importance_sampling_ratio/min": 0.638654351234436, + "sampling/sampling_logp_difference/max": 0.5611990690231323, + "sampling/sampling_logp_difference/mean": 0.013337301090359688, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 207.40625, + "completions/mean_terminated_length": 207.40625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.41590484976768494, + "epoch": 0.9313725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025386860324877646, + "kl": 0.023337356746196747, + "learning_rate": 8.722865758185035e-07, + "loss": 0.0002, + "num_tokens": 24086061.0, + "reward": -1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007213354110718, + "sampling/importance_sampling_ratio/min": 0.47247007489204407, + "sampling/sampling_logp_difference/max": 0.824960470199585, + "sampling/sampling_logp_difference/mean": 0.016363630071282387, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 228.625, + "completions/mean_terminated_length": 228.625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3403834402561188, + "epoch": 0.9325980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7640502987624201, + "kl": 0.021827127784490585, + "learning_rate": 8.718106558269452e-07, + "loss": 0.0158, + "num_tokens": 24119653.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5976622104644775, + "sampling/importance_sampling_ratio/mean": 0.9999210238456726, + "sampling/importance_sampling_ratio/min": 0.608597457408905, + "sampling/sampling_logp_difference/max": 0.4965982437133789, + "sampling/sampling_logp_difference/mean": 0.012747423723340034, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 166.296875, + "completions/mean_terminated_length": 166.296875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3273610770702362, + "epoch": 0.9338235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02234249527444986, + "kl": 0.023769231513142586, + "learning_rate": 8.713339810833105e-07, + "loss": 0.0002, + "num_tokens": 24143720.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5283203125, + "sampling/importance_sampling_ratio/mean": 0.99969881772995, + "sampling/importance_sampling_ratio/min": 0.6343420147895813, + "sampling/sampling_logp_difference/max": 0.4551670551300049, + "sampling/sampling_logp_difference/mean": 0.01511424034833908, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 194.21875, + "completions/mean_terminated_length": 194.21875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.34374314546585083, + "epoch": 0.9350490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03413304169673714, + "kl": 0.02705732360482216, + "learning_rate": 8.708565525552189e-07, + "loss": 0.0003, + "num_tokens": 24174630.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.545061707496643, + "sampling/importance_sampling_ratio/mean": 1.0002961158752441, + "sampling/importance_sampling_ratio/min": 0.6778910756111145, + "sampling/sampling_logp_difference/max": 0.43506383895874023, + "sampling/sampling_logp_difference/mean": 0.014042122289538383, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 225.171875, + "completions/mean_terminated_length": 225.171875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3956797122955322, + "epoch": 0.9362745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9027058283169005, + "kl": 0.02620890364050865, + "learning_rate": 8.703783712118202e-07, + "loss": -0.0312, + "num_tokens": 24212449.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4614901542663574, + "sampling/importance_sampling_ratio/mean": 0.9998257160186768, + "sampling/importance_sampling_ratio/min": 0.6307316422462463, + "sampling/sampling_logp_difference/max": 0.4608747959136963, + "sampling/sampling_logp_difference/mean": 0.015541866421699524, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 193.09375, + "completions/mean_terminated_length": 193.09375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3058358430862427, + "epoch": 0.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020757007458023786, + "kl": 0.02144656702876091, + "learning_rate": 8.69899438023792e-07, + "loss": 0.0002, + "num_tokens": 24240967.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9106395244598389, + "sampling/importance_sampling_ratio/mean": 1.000170350074768, + "sampling/importance_sampling_ratio/min": 0.40289777517318726, + "sampling/sampling_logp_difference/max": 0.9090723991394043, + "sampling/sampling_logp_difference/mean": 0.0138449901714921, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 250.9375, + "completions/mean_terminated_length": 250.9375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.271751344203949, + "epoch": 0.9387254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012337520673509015, + "kl": 0.016083460301160812, + "learning_rate": 8.694197539633385e-07, + "loss": 0.0001, + "num_tokens": 24277011.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.406278133392334, + "sampling/importance_sampling_ratio/mean": 1.0002005100250244, + "sampling/importance_sampling_ratio/min": 0.6132686734199524, + "sampling/sampling_logp_difference/max": 0.4889521598815918, + "sampling/sampling_logp_difference/mean": 0.011555547825992107, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 167.65625, + "completions/mean_terminated_length": 167.65625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3901594281196594, + "epoch": 0.9399509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02606277068546319, + "kl": 0.02786336839199066, + "learning_rate": 8.689393200041878e-07, + "loss": 0.0003, + "num_tokens": 24307549.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6091108322143555, + "sampling/importance_sampling_ratio/mean": 1.0003968477249146, + "sampling/importance_sampling_ratio/min": 0.6550332903862, + "sampling/sampling_logp_difference/max": 0.47568178176879883, + "sampling/sampling_logp_difference/mean": 0.015018296428024769, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 192.0, + "completions/mean_terminated_length": 192.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4136354327201843, + "epoch": 0.9411764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8874129279894816, + "kl": 0.029054976999759674, + "learning_rate": 8.684581371215904e-07, + "loss": -0.0202, + "num_tokens": 24343789.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3191401958465576, + "sampling/importance_sampling_ratio/mean": 1.0002342462539673, + "sampling/importance_sampling_ratio/min": 0.474845290184021, + "sampling/sampling_logp_difference/max": 0.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.01565386727452278, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 194.34375, + "completions/mean_terminated_length": 194.34375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.36488449573516846, + "epoch": 0.9424019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7283355799647103, + "kl": 0.03226052224636078, + "learning_rate": 8.679762062923175e-07, + "loss": 0.034, + "num_tokens": 24372499.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5199317932128906, + "sampling/importance_sampling_ratio/mean": 1.0011816024780273, + "sampling/importance_sampling_ratio/min": 0.6558409929275513, + "sampling/sampling_logp_difference/max": 0.42183685302734375, + "sampling/sampling_logp_difference/mean": 0.015518213622272015, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 176.09375, + "completions/mean_terminated_length": 176.09375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.34749341011047363, + "epoch": 0.9436274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02971181417815214, + "kl": 0.02474549412727356, + "learning_rate": 8.674935284946576e-07, + "loss": 0.0002, + "num_tokens": 24396969.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.292677640914917, + "sampling/importance_sampling_ratio/mean": 0.9994853138923645, + "sampling/importance_sampling_ratio/min": 0.7128995060920715, + "sampling/sampling_logp_difference/max": 0.33841484785079956, + "sampling/sampling_logp_difference/mean": 0.015015466138720512, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 157.296875, + "completions/mean_terminated_length": 157.296875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.31486156582832336, + "epoch": 0.9448529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026017777114247995, + "kl": 0.027308259159326553, + "learning_rate": 8.670101047084162e-07, + "loss": 0.0003, + "num_tokens": 24422348.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9757723808288574, + "sampling/importance_sampling_ratio/mean": 0.9998023509979248, + "sampling/importance_sampling_ratio/min": 0.4160304367542267, + "sampling/sampling_logp_difference/max": 0.8769968152046204, + "sampling/sampling_logp_difference/mean": 0.014564323239028454, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 188.1875, + "completions/mean_terminated_length": 188.1875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.38905900716781616, + "epoch": 0.946078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9039089435812095, + "kl": 0.024624085053801537, + "learning_rate": 8.66525935914913e-07, + "loss": 0.0136, + "num_tokens": 24449128.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4844878911972046, + "sampling/importance_sampling_ratio/mean": 0.9995477795600891, + "sampling/importance_sampling_ratio/min": 0.6306890845298767, + "sampling/sampling_logp_difference/max": 0.46094226837158203, + "sampling/sampling_logp_difference/mean": 0.016147365793585777, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 185.03125, + "completions/mean_terminated_length": 185.03125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.340989351272583, + "epoch": 0.9473039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02769537574442187, + "kl": 0.02422124333679676, + "learning_rate": 8.660410230969804e-07, + "loss": 0.0002, + "num_tokens": 24477034.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4221079349517822, + "sampling/importance_sampling_ratio/mean": 0.9997546672821045, + "sampling/importance_sampling_ratio/min": 0.6175847053527832, + "sampling/sampling_logp_difference/max": 0.48193907737731934, + "sampling/sampling_logp_difference/mean": 0.01526421494781971, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 147.140625, + "completions/mean_terminated_length": 147.140625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.34744322299957275, + "epoch": 0.9485294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02571645606738767, + "kl": 0.02424619346857071, + "learning_rate": 8.655553672389599e-07, + "loss": 0.0002, + "num_tokens": 24502595.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6170932054519653, + "sampling/importance_sampling_ratio/mean": 1.0006287097930908, + "sampling/importance_sampling_ratio/min": 0.6371588706970215, + "sampling/sampling_logp_difference/max": 0.48063015937805176, + "sampling/sampling_logp_difference/mean": 0.01588124781847, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 168.5625, + "completions/mean_terminated_length": 168.5625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4033295214176178, + "epoch": 0.9497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026614461560211557, + "kl": 0.02627166360616684, + "learning_rate": 8.650689693267026e-07, + "loss": 0.0003, + "num_tokens": 24536119.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071913003921509, + "sampling/importance_sampling_ratio/mean": 0.9998084306716919, + "sampling/importance_sampling_ratio/min": 0.6212287545204163, + "sampling/sampling_logp_difference/max": 0.4760558605194092, + "sampling/sampling_logp_difference/mean": 0.01671476662158966, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 182.03125, + "completions/mean_terminated_length": 182.03125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3922005891799927, + "epoch": 0.9509803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1076741310683296, + "kl": 0.03362289443612099, + "learning_rate": 8.645818303475654e-07, + "loss": -0.0317, + "num_tokens": 24565225.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3488740921020508, + "sampling/importance_sampling_ratio/mean": 0.9998282194137573, + "sampling/importance_sampling_ratio/min": 0.6369741559028625, + "sampling/sampling_logp_difference/max": 0.45102620124816895, + "sampling/sampling_logp_difference/mean": 0.015504934825003147, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 137.28125, + "completions/mean_terminated_length": 137.28125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.33687686920166016, + "epoch": 0.9522058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03624148232675531, + "kl": 0.031308457255363464, + "learning_rate": 8.640939512904095e-07, + "loss": 0.0003, + "num_tokens": 24593835.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5006722211837769, + "sampling/importance_sampling_ratio/mean": 0.9992250204086304, + "sampling/importance_sampling_ratio/min": 0.7104525566101074, + "sampling/sampling_logp_difference/max": 0.4059131145477295, + "sampling/sampling_logp_difference/mean": 0.015481984242796898, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 161.328125, + "completions/mean_terminated_length": 161.328125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.36610496044158936, + "epoch": 0.9534313725490197, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02376816592999184, + "kl": 0.02393110655248165, + "learning_rate": 8.636053331455986e-07, + "loss": 0.0002, + "num_tokens": 24622064.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5744339227676392, + "sampling/importance_sampling_ratio/mean": 0.9991030097007751, + "sampling/importance_sampling_ratio/min": 0.6132981777191162, + "sampling/sampling_logp_difference/max": 0.4889039993286133, + "sampling/sampling_logp_difference/mean": 0.016807250678539276, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 176.859375, + "completions/mean_terminated_length": 176.859375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3858628273010254, + "epoch": 0.9546568627450981, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03668443118356197, + "kl": 0.0322355255484581, + "learning_rate": 8.631159769049964e-07, + "loss": 0.0003, + "num_tokens": 24654519.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999144673347473, + "sampling/importance_sampling_ratio/min": 0.5613397359848022, + "sampling/sampling_logp_difference/max": 0.8548152446746826, + "sampling/sampling_logp_difference/mean": 0.01737913489341736, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 190.203125, + "completions/mean_terminated_length": 190.203125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.36571139097213745, + "epoch": 0.9558823529411765, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9619406119505388, + "kl": 0.03215567767620087, + "learning_rate": 8.626258835619653e-07, + "loss": 0.0087, + "num_tokens": 24682900.0, + "reward": -0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4509612321853638, + "sampling/importance_sampling_ratio/mean": 0.99996018409729, + "sampling/importance_sampling_ratio/min": 0.5384482741355896, + "sampling/sampling_logp_difference/max": 0.6190638542175293, + "sampling/sampling_logp_difference/mean": 0.015748433768749237, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 221.984375, + "completions/mean_terminated_length": 221.984375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3734496831893921, + "epoch": 0.9571078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023910769326798156, + "kl": 0.024634193629026413, + "learning_rate": 8.621350541113636e-07, + "loss": 0.0002, + "num_tokens": 24715219.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3035078048706055, + "sampling/importance_sampling_ratio/mean": 0.9995801448822021, + "sampling/importance_sampling_ratio/min": 0.6926900148391724, + "sampling/sampling_logp_difference/max": 0.3671727180480957, + "sampling/sampling_logp_difference/mean": 0.014857929199934006, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 179.96875, + "completions/mean_terminated_length": 179.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.31734922528266907, + "epoch": 0.9583333333333334, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022605097620998472, + "kl": 0.024092108011245728, + "learning_rate": 8.616434895495439e-07, + "loss": 0.0002, + "num_tokens": 24740017.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4312864542007446, + "sampling/importance_sampling_ratio/mean": 1.0001903772354126, + "sampling/importance_sampling_ratio/min": 0.6522687077522278, + "sampling/sampling_logp_difference/max": 0.4272986650466919, + "sampling/sampling_logp_difference/mean": 0.014620056375861168, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 197.671875, + "completions/mean_terminated_length": 197.671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.368533730506897, + "epoch": 0.9595588235294118, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03031176642955676, + "kl": 0.02464437112212181, + "learning_rate": 8.611511908743514e-07, + "loss": 0.0002, + "num_tokens": 24767324.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6221281290054321, + "sampling/importance_sampling_ratio/mean": 1.0000519752502441, + "sampling/importance_sampling_ratio/min": 0.6308545470237732, + "sampling/sampling_logp_difference/max": 0.48373889923095703, + "sampling/sampling_logp_difference/mean": 0.015080577693879604, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 187.6875, + "completions/mean_terminated_length": 187.6875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3149415850639343, + "epoch": 0.9607843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022615322522914705, + "kl": 0.031320638954639435, + "learning_rate": 8.606581590851208e-07, + "loss": 0.0002, + "num_tokens": 24794376.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4683986902236938, + "sampling/importance_sampling_ratio/mean": 0.999013364315033, + "sampling/importance_sampling_ratio/min": 0.6115894913673401, + "sampling/sampling_logp_difference/max": 0.49169397354125977, + "sampling/sampling_logp_difference/mean": 0.01463400200009346, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 203.390625, + "completions/mean_terminated_length": 203.390625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4158928096294403, + "epoch": 0.9620098039215687, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017029810644211132, + "kl": 0.02237580344080925, + "learning_rate": 8.601643951826758e-07, + "loss": 0.0002, + "num_tokens": 24827121.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4754341840744019, + "sampling/importance_sampling_ratio/mean": 0.9997149705886841, + "sampling/importance_sampling_ratio/min": 0.63714200258255, + "sampling/sampling_logp_difference/max": 0.4507627487182617, + "sampling/sampling_logp_difference/mean": 0.015549298375844955, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 169.078125, + "completions/mean_terminated_length": 169.078125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3622353971004486, + "epoch": 0.9632352941176471, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02681246787418524, + "kl": 0.025942612439393997, + "learning_rate": 8.596699001693255e-07, + "loss": 0.0003, + "num_tokens": 24854198.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6397879123687744, + "sampling/importance_sampling_ratio/mean": 0.9995217323303223, + "sampling/importance_sampling_ratio/min": 0.6080277562141418, + "sampling/sampling_logp_difference/max": 0.49753475189208984, + "sampling/sampling_logp_difference/mean": 0.014673823490738869, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 229.984375, + "completions/mean_terminated_length": 229.984375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.39878255128860474, + "epoch": 0.9644607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7017346655063545, + "kl": 0.03290364891290665, + "learning_rate": 8.591746750488637e-07, + "loss": 0.0012, + "num_tokens": 24888629.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7453142404556274, + "sampling/importance_sampling_ratio/mean": 0.9994145631790161, + "sampling/importance_sampling_ratio/min": 0.42222246527671814, + "sampling/sampling_logp_difference/max": 0.8622229099273682, + "sampling/sampling_logp_difference/mean": 0.015562876127660275, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 186.09375, + "completions/mean_terminated_length": 186.09375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.36311158537864685, + "epoch": 0.9656862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025418103632889946, + "kl": 0.02377057820558548, + "learning_rate": 8.58678720826566e-07, + "loss": 0.0003, + "num_tokens": 24917659.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5889713764190674, + "sampling/importance_sampling_ratio/mean": 0.9998500347137451, + "sampling/importance_sampling_ratio/min": 0.6410223841667175, + "sampling/sampling_logp_difference/max": 0.4630868434906006, + "sampling/sampling_logp_difference/mean": 0.015514722093939781, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 195.5, + "completions/mean_terminated_length": 195.5, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2629473805427551, + "epoch": 0.9669117647058824, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016260778793600026, + "kl": 0.017105624079704285, + "learning_rate": 8.58182038509188e-07, + "loss": 0.0002, + "num_tokens": 24947947.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4748215675354004, + "sampling/importance_sampling_ratio/mean": 0.9995681047439575, + "sampling/importance_sampling_ratio/min": 0.346270352602005, + "sampling/sampling_logp_difference/max": 1.0605354309082031, + "sampling/sampling_logp_difference/mean": 0.011698717251420021, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 192.3125, + "completions/mean_terminated_length": 192.3125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3489529490470886, + "epoch": 0.9681372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04731401217274868, + "kl": 0.032521091401576996, + "learning_rate": 8.576846291049633e-07, + "loss": 0.0003, + "num_tokens": 24980255.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.652165174484253, + "sampling/importance_sampling_ratio/mean": 1.0006591081619263, + "sampling/importance_sampling_ratio/min": 0.643796443939209, + "sampling/sampling_logp_difference/max": 0.5020866394042969, + "sampling/sampling_logp_difference/mean": 0.01453758031129837, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 248.921875, + "completions/mean_terminated_length": 248.921875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.45151424407958984, + "epoch": 0.9693627450980392, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9283430316909879, + "kl": 0.04374338686466217, + "learning_rate": 8.571864936236015e-07, + "loss": 0.0154, + "num_tokens": 25010602.0, + "reward": 0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4305888414382935, + "sampling/importance_sampling_ratio/mean": 0.9994878768920898, + "sampling/importance_sampling_ratio/min": 0.5685389041900635, + "sampling/sampling_logp_difference/max": 0.5646854639053345, + "sampling/sampling_logp_difference/mean": 0.017240433022379875, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 183.34375, + "completions/mean_terminated_length": 183.34375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3219026029109955, + "epoch": 0.9705882352941176, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023093056049612728, + "kl": 0.022820979356765747, + "learning_rate": 8.56687633076286e-07, + "loss": 0.0002, + "num_tokens": 25038688.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000104904174805, + "sampling/importance_sampling_ratio/min": 0.43182510137557983, + "sampling/sampling_logp_difference/max": 0.839734673500061, + "sampling/sampling_logp_difference/mean": 0.01387720089405775, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 189.234375, + "completions/mean_terminated_length": 189.234375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.398162305355072, + "epoch": 0.9718137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02067089574625209, + "kl": 0.023449556902050972, + "learning_rate": 8.561880484756724e-07, + "loss": 0.0002, + "num_tokens": 25071135.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4607446193695068, + "sampling/importance_sampling_ratio/mean": 0.9997786283493042, + "sampling/importance_sampling_ratio/min": 0.6524137258529663, + "sampling/sampling_logp_difference/max": 0.4270763397216797, + "sampling/sampling_logp_difference/mean": 0.016811659559607506, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 182.90625, + "completions/mean_terminated_length": 182.90625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3328895568847656, + "epoch": 0.9730392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02490222856414553, + "kl": 0.022241603583097458, + "learning_rate": 8.556877408358854e-07, + "loss": 0.0002, + "num_tokens": 25098921.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.656008243560791, + "sampling/importance_sampling_ratio/mean": 0.9999021887779236, + "sampling/importance_sampling_ratio/min": 0.5575329065322876, + "sampling/sampling_logp_difference/max": 0.5842337608337402, + "sampling/sampling_logp_difference/mean": 0.014693650417029858, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 204.21875, + "completions/mean_terminated_length": 204.21875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4073524475097656, + "epoch": 0.9742647058823529, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7699289479757312, + "kl": 0.025464978069067, + "learning_rate": 8.551867111725182e-07, + "loss": -0.0142, + "num_tokens": 25128439.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6670647859573364, + "sampling/importance_sampling_ratio/mean": 0.9994465112686157, + "sampling/importance_sampling_ratio/min": 0.6182374358177185, + "sampling/sampling_logp_difference/max": 0.5110645294189453, + "sampling/sampling_logp_difference/mean": 0.014799746684730053, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 209.609375, + "completions/mean_terminated_length": 209.609375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.4414971172809601, + "epoch": 0.9754901960784313, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021774394806298312, + "kl": 0.02606060355901718, + "learning_rate": 8.546849605026288e-07, + "loss": 0.0002, + "num_tokens": 25162974.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.507165551185608, + "sampling/importance_sampling_ratio/mean": 0.9992042779922485, + "sampling/importance_sampling_ratio/min": 0.6603471040725708, + "sampling/sampling_logp_difference/max": 0.414989709854126, + "sampling/sampling_logp_difference/mean": 0.017267197370529175, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 211.375, + "completions/mean_terminated_length": 211.375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.43835189938545227, + "epoch": 0.9767156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021780071068811264, + "kl": 0.022342439740896225, + "learning_rate": 8.541824898447397e-07, + "loss": 0.0002, + "num_tokens": 25198278.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8651736974716187, + "sampling/importance_sampling_ratio/mean": 0.9994983077049255, + "sampling/importance_sampling_ratio/min": 0.6086403727531433, + "sampling/sampling_logp_difference/max": 0.6233541965484619, + "sampling/sampling_logp_difference/mean": 0.01614305004477501, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 204.578125, + "completions/mean_terminated_length": 204.578125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.34380316734313965, + "epoch": 0.9779411764705882, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9334916089133114, + "kl": 0.021110976114869118, + "learning_rate": 8.536793002188343e-07, + "loss": -0.0265, + "num_tokens": 25229195.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5910096168518066, + "sampling/importance_sampling_ratio/mean": 0.9997698664665222, + "sampling/importance_sampling_ratio/min": 0.5589753985404968, + "sampling/sampling_logp_difference/max": 0.5816497802734375, + "sampling/sampling_logp_difference/mean": 0.014641951769590378, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 190.953125, + "completions/mean_terminated_length": 190.953125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3930814266204834, + "epoch": 0.9791666666666666, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9708205281054662, + "kl": 0.024711620062589645, + "learning_rate": 8.531753926463556e-07, + "loss": -0.0429, + "num_tokens": 25266424.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.753625512123108, + "sampling/importance_sampling_ratio/mean": 1.001275658607483, + "sampling/importance_sampling_ratio/min": 0.4847486913204193, + "sampling/sampling_logp_difference/max": 0.7241246700286865, + "sampling/sampling_logp_difference/mean": 0.015727955847978592, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 246.328125, + "completions/mean_terminated_length": 246.328125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.35948020219802856, + "epoch": 0.9803921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8200327497323995, + "kl": 0.028046952560544014, + "learning_rate": 8.526707681502043e-07, + "loss": 0.0293, + "num_tokens": 25310893.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.656794548034668, + "sampling/importance_sampling_ratio/mean": 1.0002923011779785, + "sampling/importance_sampling_ratio/min": 0.6141847968101501, + "sampling/sampling_logp_difference/max": 0.5048847198486328, + "sampling/sampling_logp_difference/mean": 0.013518155552446842, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 204.375, + "completions/mean_terminated_length": 204.375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3920966386795044, + "epoch": 0.9816176470588235, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031294687145535204, + "kl": 0.025551265105605125, + "learning_rate": 8.521654277547361e-07, + "loss": 0.0003, + "num_tokens": 25344293.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.602399230003357, + "sampling/importance_sampling_ratio/mean": 0.9989451766014099, + "sampling/importance_sampling_ratio/min": 0.654786229133606, + "sampling/sampling_logp_difference/max": 0.47150206565856934, + "sampling/sampling_logp_difference/mean": 0.016796045005321503, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 199.421875, + "completions/mean_terminated_length": 199.421875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.37933820486068726, + "epoch": 0.9828431372549019, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021350614162237914, + "kl": 0.025682110339403152, + "learning_rate": 8.516593724857597e-07, + "loss": 0.0003, + "num_tokens": 25375216.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3848679065704346, + "sampling/importance_sampling_ratio/mean": 0.9996330738067627, + "sampling/importance_sampling_ratio/min": 0.6501674652099609, + "sampling/sampling_logp_difference/max": 0.4305253028869629, + "sampling/sampling_logp_difference/mean": 0.015885349363088608, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 204.28125, + "completions/mean_terminated_length": 204.28125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4139837622642517, + "epoch": 0.9840686274509803, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036269306023235776, + "kl": 0.02276715636253357, + "learning_rate": 8.511526033705356e-07, + "loss": 0.0002, + "num_tokens": 25406194.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6009104251861572, + "sampling/importance_sampling_ratio/mean": 0.9997035264968872, + "sampling/importance_sampling_ratio/min": 0.5364245176315308, + "sampling/sampling_logp_difference/max": 0.6228294372558594, + "sampling/sampling_logp_difference/mean": 0.016889125108718872, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 210.828125, + "completions/mean_terminated_length": 210.828125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.36640146374702454, + "epoch": 0.9852941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0232377342843665, + "kl": 0.023490116000175476, + "learning_rate": 8.506451214377728e-07, + "loss": 0.0002, + "num_tokens": 25436359.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999101758003235, + "sampling/importance_sampling_ratio/min": 0.5069254636764526, + "sampling/sampling_logp_difference/max": 0.7274184226989746, + "sampling/sampling_logp_difference/mean": 0.01511009968817234, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 160.4375, + "completions/mean_terminated_length": 160.4375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3254278898239136, + "epoch": 0.9865196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02277267755169316, + "kl": 0.025547131896018982, + "learning_rate": 8.501369277176274e-07, + "loss": 0.0003, + "num_tokens": 25469027.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5311439037322998, + "sampling/importance_sampling_ratio/mean": 0.99989914894104, + "sampling/importance_sampling_ratio/min": 0.6171427965164185, + "sampling/sampling_logp_difference/max": 0.4826548099517822, + "sampling/sampling_logp_difference/mean": 0.014904014766216278, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 189.921875, + "completions/mean_terminated_length": 189.921875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3373665511608124, + "epoch": 0.9877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030394630041370656, + "kl": 0.028624843806028366, + "learning_rate": 8.496280232417007e-07, + "loss": 0.0003, + "num_tokens": 25505374.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3983898162841797, + "sampling/importance_sampling_ratio/mean": 0.9999194145202637, + "sampling/importance_sampling_ratio/min": 0.6314087510108948, + "sampling/sampling_logp_difference/max": 0.45980191230773926, + "sampling/sampling_logp_difference/mean": 0.015270407311618328, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 199.1875, + "completions/mean_terminated_length": 199.1875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.3611469864845276, + "epoch": 0.9889705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024047604858430795, + "kl": 0.02229381911456585, + "learning_rate": 8.491184090430363e-07, + "loss": 0.0002, + "num_tokens": 25534474.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6014314889907837, + "sampling/importance_sampling_ratio/mean": 0.9993778467178345, + "sampling/importance_sampling_ratio/min": 0.721843421459198, + "sampling/sampling_logp_difference/max": 0.470897912979126, + "sampling/sampling_logp_difference/mean": 0.015921613201498985, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 252.5, + "completions/mean_terminated_length": 252.5, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.36096253991127014, + "epoch": 0.9901960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017139250348018616, + "kl": 0.01766836643218994, + "learning_rate": 8.48608086156119e-07, + "loss": 0.0002, + "num_tokens": 25570554.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7675029039382935, + "sampling/importance_sampling_ratio/mean": 1.000484824180603, + "sampling/importance_sampling_ratio/min": 0.5053761601448059, + "sampling/sampling_logp_difference/max": 0.6824522018432617, + "sampling/sampling_logp_difference/mean": 0.014950842596590519, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 195.21875, + "completions/mean_terminated_length": 195.21875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2919182777404785, + "epoch": 0.991421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02545194783559473, + "kl": 0.022783063352108, + "learning_rate": 8.480970556168717e-07, + "loss": 0.0002, + "num_tokens": 25595112.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4753773212432861, + "sampling/importance_sampling_ratio/mean": 0.9998204112052917, + "sampling/importance_sampling_ratio/min": 0.6529889702796936, + "sampling/sampling_logp_difference/max": 0.42619502544403076, + "sampling/sampling_logp_difference/mean": 0.014107544906437397, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 269.34375, + "completions/mean_terminated_length": 269.34375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.430225133895874, + "epoch": 0.9926470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7129722044749738, + "kl": 0.023319419473409653, + "learning_rate": 8.47585318462654e-07, + "loss": 0.023, + "num_tokens": 25632446.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4448902606964111, + "sampling/importance_sampling_ratio/mean": 0.9993377923965454, + "sampling/importance_sampling_ratio/min": 0.6666995882987976, + "sampling/sampling_logp_difference/max": 0.40541577339172363, + "sampling/sampling_logp_difference/mean": 0.01555697526782751, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 272.359375, + "completions/mean_terminated_length": 272.359375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.40380987524986267, + "epoch": 0.9938725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6766444865459755, + "kl": 0.019705766811966896, + "learning_rate": 8.470728757322603e-07, + "loss": 0.0152, + "num_tokens": 25671429.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0000799894332886, + "sampling/importance_sampling_ratio/min": 0.6306539177894592, + "sampling/sampling_logp_difference/max": 0.4609980583190918, + "sampling/sampling_logp_difference/mean": 0.014769963920116425, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 147.265625, + "completions/mean_terminated_length": 147.265625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.24114222824573517, + "epoch": 0.9950980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03595364233249824, + "kl": 0.02802179381251335, + "learning_rate": 8.465597284659163e-07, + "loss": 0.0003, + "num_tokens": 25694054.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.475376844406128, + "sampling/importance_sampling_ratio/mean": 0.9994142651557922, + "sampling/importance_sampling_ratio/min": 0.6254650950431824, + "sampling/sampling_logp_difference/max": 0.46925973892211914, + "sampling/sampling_logp_difference/mean": 0.01176031120121479, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 204.5, + "completions/mean_terminated_length": 204.5, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3158072233200073, + "epoch": 0.9963235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02249599983166481, + "kl": 0.023762090131640434, + "learning_rate": 8.460458777052788e-07, + "loss": 0.0002, + "num_tokens": 25724774.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998077750205994, + "sampling/importance_sampling_ratio/min": 0.6465713977813721, + "sampling/sampling_logp_difference/max": 1.22906494140625, + "sampling/sampling_logp_difference/mean": 0.014211948961019516, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 244.6875, + "completions/mean_terminated_length": 244.6875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.45422881841659546, + "epoch": 0.9975490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.078847620590917, + "kl": 0.03181261569261551, + "learning_rate": 8.455313244934324e-07, + "loss": 0.0243, + "num_tokens": 25760546.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.6600011587142944, + "sampling/importance_sampling_ratio/mean": 1.00004243850708, + "sampling/importance_sampling_ratio/min": 0.7450798749923706, + "sampling/sampling_logp_difference/max": 0.5068182945251465, + "sampling/sampling_logp_difference/mean": 0.015185045078396797, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 246.359375, + "completions/mean_terminated_length": 246.359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3757499158382416, + "epoch": 0.9987745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8332631864909773, + "kl": 0.026064181700348854, + "learning_rate": 8.450160698748871e-07, + "loss": -0.0242, + "num_tokens": 25792345.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5309606790542603, + "sampling/importance_sampling_ratio/mean": 0.9993084073066711, + "sampling/importance_sampling_ratio/min": 0.5552458167076111, + "sampling/sampling_logp_difference/max": 0.5883443355560303, + "sampling/sampling_logp_difference/mean": 0.01448429748415947, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 178.703125, + "completions/mean_terminated_length": 178.703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4138263463973999, + "epoch": 1.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024616208636678072, + "kl": 0.039309754967689514, + "learning_rate": 8.445001148955775e-07, + "loss": 0.0004, + "num_tokens": 25818646.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071574449539185, + "sampling/importance_sampling_ratio/mean": 1.0002918243408203, + "sampling/importance_sampling_ratio/min": 0.6660245656967163, + "sampling/sampling_logp_difference/max": 0.41022539138793945, + "sampling/sampling_logp_difference/mean": 0.01580060087144375, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 222.328125, + "completions/mean_terminated_length": 222.328125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.44479626417160034, + "epoch": 1.0012254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016823090218986002, + "kl": 0.021708671003580093, + "learning_rate": 8.439834606028593e-07, + "loss": 0.0002, + "num_tokens": 25852363.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4680591821670532, + "sampling/importance_sampling_ratio/mean": 0.9998198747634888, + "sampling/importance_sampling_ratio/min": 0.642697274684906, + "sampling/sampling_logp_difference/max": 0.4420814514160156, + "sampling/sampling_logp_difference/mean": 0.016688354313373566, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 237.9375, + "completions/mean_terminated_length": 237.9375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4154745936393738, + "epoch": 1.0024509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7987049514767518, + "kl": 0.03554980456829071, + "learning_rate": 8.434661080455082e-07, + "loss": 0.0063, + "num_tokens": 25886503.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.621817946434021, + "sampling/importance_sampling_ratio/mean": 1.0001156330108643, + "sampling/importance_sampling_ratio/min": 0.6282684803009033, + "sampling/sampling_logp_difference/max": 0.4835476875305176, + "sampling/sampling_logp_difference/mean": 0.014652641490101814, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 168.46875, + "completions/mean_terminated_length": 168.46875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.33490508794784546, + "epoch": 1.0036764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9270778190476674, + "kl": 0.03322033956646919, + "learning_rate": 8.42948058273717e-07, + "loss": 0.0077, + "num_tokens": 25910277.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.545121669769287, + "sampling/importance_sampling_ratio/mean": 1.0001842975616455, + "sampling/importance_sampling_ratio/min": 0.6543320417404175, + "sampling/sampling_logp_difference/max": 0.4351027011871338, + "sampling/sampling_logp_difference/mean": 0.01393546536564827, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3989688456058502, + "epoch": 1.0049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01897158266492704, + "kl": 0.025341054424643517, + "learning_rate": 8.424293123390938e-07, + "loss": 0.0002, + "num_tokens": 25940357.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6479039192199707, + "sampling/importance_sampling_ratio/mean": 1.0001342296600342, + "sampling/importance_sampling_ratio/min": 0.6151823997497559, + "sampling/sampling_logp_difference/max": 0.49950408935546875, + "sampling/sampling_logp_difference/mean": 0.01571480929851532, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 236.609375, + "completions/mean_terminated_length": 236.609375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.5057919025421143, + "epoch": 1.0061274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8573369416986352, + "kl": 0.032287076115608215, + "learning_rate": 8.4190987129466e-07, + "loss": -0.021, + "num_tokens": 25974188.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.634234070777893, + "sampling/importance_sampling_ratio/mean": 0.9997125864028931, + "sampling/importance_sampling_ratio/min": 0.6482210755348206, + "sampling/sampling_logp_difference/max": 0.49117422103881836, + "sampling/sampling_logp_difference/mean": 0.016607385128736496, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 228.359375, + "completions/mean_terminated_length": 228.359375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.4514944851398468, + "epoch": 1.0073529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8506365877176593, + "kl": 0.0204324908554554, + "learning_rate": 8.413897361948483e-07, + "loss": 0.0116, + "num_tokens": 26006323.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5485343933105469, + "sampling/importance_sampling_ratio/mean": 1.000069499015808, + "sampling/importance_sampling_ratio/min": 0.7095987200737, + "sampling/sampling_logp_difference/max": 0.4373089075088501, + "sampling/sampling_logp_difference/mean": 0.015193624421954155, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 240.65625, + "completions/mean_terminated_length": 240.65625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3850115239620209, + "epoch": 1.008578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01656511101086973, + "kl": 0.022681277245283127, + "learning_rate": 8.408689080954997e-07, + "loss": 0.0002, + "num_tokens": 26042717.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3688677549362183, + "sampling/importance_sampling_ratio/mean": 0.9996740818023682, + "sampling/importance_sampling_ratio/min": 0.19988328218460083, + "sampling/sampling_logp_difference/max": 1.610021710395813, + "sampling/sampling_logp_difference/mean": 0.014235937967896461, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 224.65625, + "completions/mean_terminated_length": 224.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3661452829837799, + "epoch": 1.0098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02098756637086174, + "kl": 0.028736956417560577, + "learning_rate": 8.403473880538625e-07, + "loss": 0.0003, + "num_tokens": 26077175.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4588247537612915, + "sampling/importance_sampling_ratio/mean": 0.9994629621505737, + "sampling/importance_sampling_ratio/min": 0.6147721409797668, + "sampling/sampling_logp_difference/max": 0.48650360107421875, + "sampling/sampling_logp_difference/mean": 0.01425854954868555, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4839499294757843, + "epoch": 1.0110294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0668102894492004, + "kl": 0.035447005182504654, + "learning_rate": 8.398251771285892e-07, + "loss": -0.0105, + "num_tokens": 26112863.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5414327383041382, + "sampling/importance_sampling_ratio/mean": 0.999983012676239, + "sampling/importance_sampling_ratio/min": 0.7225062847137451, + "sampling/sampling_logp_difference/max": 0.4327123165130615, + "sampling/sampling_logp_difference/mean": 0.017018748447299004, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 262.1875, + "completions/mean_terminated_length": 262.1875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.437021404504776, + "epoch": 1.0122549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.820264979193408, + "kl": 0.025450553745031357, + "learning_rate": 8.393022763797346e-07, + "loss": 0.0001, + "num_tokens": 26144827.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4006553888320923, + "sampling/importance_sampling_ratio/mean": 1.0000797510147095, + "sampling/importance_sampling_ratio/min": 0.6503985524177551, + "sampling/sampling_logp_difference/max": 0.430169939994812, + "sampling/sampling_logp_difference/mean": 0.015213541686534882, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 164.296875, + "completions/mean_terminated_length": 164.296875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3759983777999878, + "epoch": 1.0134803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7982833386554936, + "kl": 0.05652917921543121, + "learning_rate": 8.387786868687548e-07, + "loss": -0.0012, + "num_tokens": 26167262.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3700846433639526, + "sampling/importance_sampling_ratio/mean": 0.9994276762008667, + "sampling/importance_sampling_ratio/min": 0.7327985167503357, + "sampling/sampling_logp_difference/max": 0.31487250328063965, + "sampling/sampling_logp_difference/mean": 0.014417910017073154, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 287.203125, + "completions/mean_terminated_length": 287.203125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.5151238441467285, + "epoch": 1.0147058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0133548511461008, + "kl": 0.030357833951711655, + "learning_rate": 8.382544096585026e-07, + "loss": -0.0312, + "num_tokens": 26200907.0, + "reward": 0.1875, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.7406495809555054, + "sampling/importance_sampling_ratio/mean": 0.9997864365577698, + "sampling/importance_sampling_ratio/min": 0.5055920481681824, + "sampling/sampling_logp_difference/max": 0.6820251941680908, + "sampling/sampling_logp_difference/mean": 0.016376610845327377, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 219.203125, + "completions/mean_terminated_length": 219.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.44265711307525635, + "epoch": 1.0159313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018610019581102484, + "kl": 0.02551637962460518, + "learning_rate": 8.37729445813228e-07, + "loss": 0.0003, + "num_tokens": 26235112.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.465740442276001, + "sampling/importance_sampling_ratio/mean": 1.0003550052642822, + "sampling/importance_sampling_ratio/min": 0.6710967421531677, + "sampling/sampling_logp_difference/max": 0.3988419771194458, + "sampling/sampling_logp_difference/mean": 0.014949493110179901, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 235.140625, + "completions/mean_terminated_length": 235.140625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.42499053478240967, + "epoch": 1.017156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01943161888339381, + "kl": 0.025752726942300797, + "learning_rate": 8.372037963985741e-07, + "loss": 0.0002, + "num_tokens": 26273297.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3642884492874146, + "sampling/importance_sampling_ratio/mean": 0.9999311566352844, + "sampling/importance_sampling_ratio/min": 0.5483862161636353, + "sampling/sampling_logp_difference/max": 0.6007754802703857, + "sampling/sampling_logp_difference/mean": 0.015527062118053436, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 271.234375, + "completions/mean_terminated_length": 271.234375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3281579613685608, + "epoch": 1.0183823529411764, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8457087183973668, + "kl": 0.030670788139104843, + "learning_rate": 8.366774624815761e-07, + "loss": -0.103, + "num_tokens": 26311648.0, + "reward": 0.4375, + "reward_std": 0.5081988573074341, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.293837308883667, + "sampling/importance_sampling_ratio/mean": 1.000064730644226, + "sampling/importance_sampling_ratio/min": 0.6780655384063721, + "sampling/sampling_logp_difference/max": 0.3885113000869751, + "sampling/sampling_logp_difference/mean": 0.012746588326990604, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 182.1875, + "completions/mean_terminated_length": 182.1875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.38060179352760315, + "epoch": 1.0196078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1358667089034782, + "kl": 0.02932559885084629, + "learning_rate": 8.361504451306584e-07, + "loss": -0.0295, + "num_tokens": 26345548.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.2997032403945923, + "sampling/importance_sampling_ratio/mean": 0.9998878836631775, + "sampling/importance_sampling_ratio/min": 0.48163238167762756, + "sampling/sampling_logp_difference/max": 0.7305741310119629, + "sampling/sampling_logp_difference/mean": 0.014157561585307121, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 235.484375, + "completions/mean_terminated_length": 235.484375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.40709584951400757, + "epoch": 1.0208333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8908409729496476, + "kl": 0.028004394844174385, + "learning_rate": 8.356227454156328e-07, + "loss": 0.0039, + "num_tokens": 26376523.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4950038194656372, + "sampling/importance_sampling_ratio/mean": 0.9999670386314392, + "sampling/importance_sampling_ratio/min": 0.703667402267456, + "sampling/sampling_logp_difference/max": 0.40212881565093994, + "sampling/sampling_logp_difference/mean": 0.015612028539180756, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 217.65625, + "completions/mean_terminated_length": 217.65625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4033668339252472, + "epoch": 1.0220588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02725360252468977, + "kl": 0.03612522780895233, + "learning_rate": 8.350943644076964e-07, + "loss": 0.0004, + "num_tokens": 26406869.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4186195135116577, + "sampling/importance_sampling_ratio/mean": 0.9994950294494629, + "sampling/importance_sampling_ratio/min": 0.6725078225135803, + "sampling/sampling_logp_difference/max": 0.39674150943756104, + "sampling/sampling_logp_difference/mean": 0.014155510812997818, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 185.625, + "completions/mean_terminated_length": 185.625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3583762049674988, + "epoch": 1.0232843137254901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02149560434094024, + "kl": 0.028283949941396713, + "learning_rate": 8.34565303179429e-07, + "loss": 0.0003, + "num_tokens": 26434141.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3261923789978027, + "sampling/importance_sampling_ratio/mean": 1.0003596544265747, + "sampling/importance_sampling_ratio/min": 0.7813834547996521, + "sampling/sampling_logp_difference/max": 0.2823120355606079, + "sampling/sampling_logp_difference/mean": 0.013688826002180576, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 228.828125, + "completions/mean_terminated_length": 228.828125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.49970585107803345, + "epoch": 1.0245098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3938016731403131, + "kl": 0.08226551115512848, + "learning_rate": 8.340355628047917e-07, + "loss": -0.0592, + "num_tokens": 26467666.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4455267190933228, + "sampling/importance_sampling_ratio/mean": 1.000573992729187, + "sampling/importance_sampling_ratio/min": 0.6101624965667725, + "sampling/sampling_logp_difference/max": 0.4940299987792969, + "sampling/sampling_logp_difference/mean": 0.017299897968769073, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 221.09375, + "completions/mean_terminated_length": 221.09375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.5714606046676636, + "epoch": 1.025735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026558899090748923, + "kl": 0.04937228932976723, + "learning_rate": 8.335051443591234e-07, + "loss": 0.0005, + "num_tokens": 26500936.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4323515892028809, + "sampling/importance_sampling_ratio/mean": 1.0003485679626465, + "sampling/importance_sampling_ratio/min": 0.7121589183807373, + "sampling/sampling_logp_difference/max": 0.3593175411224365, + "sampling/sampling_logp_difference/mean": 0.01874801144003868, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 209.3125, + "completions/mean_terminated_length": 209.3125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3629034161567688, + "epoch": 1.0269607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016865839137336323, + "kl": 0.024588044732809067, + "learning_rate": 8.329740489191405e-07, + "loss": 0.0002, + "num_tokens": 26529948.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5092782974243164, + "sampling/importance_sampling_ratio/mean": 1.0002012252807617, + "sampling/importance_sampling_ratio/min": 0.6888720989227295, + "sampling/sampling_logp_difference/max": 0.41163158416748047, + "sampling/sampling_logp_difference/mean": 0.013675286434590816, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 269.453125, + "completions/mean_terminated_length": 269.453125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.5257227420806885, + "epoch": 1.0281862745098038, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.1266783710781607, + "kl": 0.06060445308685303, + "learning_rate": 8.324422775629327e-07, + "loss": -0.0037, + "num_tokens": 26569433.0, + "reward": 0.53125, + "reward_std": 0.519389271736145, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4926996231079102, + "sampling/importance_sampling_ratio/mean": 0.9997835755348206, + "sampling/importance_sampling_ratio/min": 0.6224827170372009, + "sampling/sampling_logp_difference/max": 0.4740394353866577, + "sampling/sampling_logp_difference/mean": 0.01740849018096924, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 159.71875, + "completions/mean_terminated_length": 159.71875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.39167678356170654, + "epoch": 1.0294117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02078355836308601, + "kl": 0.02990805357694626, + "learning_rate": 8.319098313699624e-07, + "loss": 0.0003, + "num_tokens": 26599079.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.368140459060669, + "sampling/importance_sampling_ratio/mean": 0.9998688101768494, + "sampling/importance_sampling_ratio/min": 0.6918001174926758, + "sampling/sampling_logp_difference/max": 0.36845827102661133, + "sampling/sampling_logp_difference/mean": 0.015625260770320892, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 221.078125, + "completions/mean_terminated_length": 221.078125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.44650688767433167, + "epoch": 1.0306372549019607, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8699635035829281, + "kl": 0.03556312993168831, + "learning_rate": 8.313767114210615e-07, + "loss": 0.0255, + "num_tokens": 26640492.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.722933292388916, + "sampling/importance_sampling_ratio/mean": 0.9996669888496399, + "sampling/importance_sampling_ratio/min": 0.44537386298179626, + "sampling/sampling_logp_difference/max": 0.8088412284851074, + "sampling/sampling_logp_difference/mean": 0.016315573826432228, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 169.125, + "completions/mean_terminated_length": 169.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3267580270767212, + "epoch": 1.031862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020172033230415685, + "kl": 0.031396303325891495, + "learning_rate": 8.308429187984298e-07, + "loss": 0.0003, + "num_tokens": 26666036.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.559213399887085, + "sampling/importance_sampling_ratio/mean": 1.000575304031372, + "sampling/importance_sampling_ratio/min": 0.6239210367202759, + "sampling/sampling_logp_difference/max": 0.47173142433166504, + "sampling/sampling_logp_difference/mean": 0.013954175636172295, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 230.296875, + "completions/mean_terminated_length": 230.296875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.46382033824920654, + "epoch": 1.0330882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8635833195747631, + "kl": 0.02818339504301548, + "learning_rate": 8.303084545856322e-07, + "loss": 0.0231, + "num_tokens": 26706951.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.2970181703567505, + "sampling/importance_sampling_ratio/mean": 0.999815046787262, + "sampling/importance_sampling_ratio/min": 0.7092636227607727, + "sampling/sampling_logp_difference/max": 0.34352803230285645, + "sampling/sampling_logp_difference/mean": 0.014819949865341187, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 186.1875, + "completions/mean_terminated_length": 186.1875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.44449472427368164, + "epoch": 1.0343137254901962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0177674076373009, + "kl": 0.02924432046711445, + "learning_rate": 8.297733198675977e-07, + "loss": 0.0003, + "num_tokens": 26740691.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.601992130279541, + "sampling/importance_sampling_ratio/mean": 1.0001962184906006, + "sampling/importance_sampling_ratio/min": 0.7089138031005859, + "sampling/sampling_logp_difference/max": 0.47124791145324707, + "sampling/sampling_logp_difference/mean": 0.01642962172627449, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 181.34375, + "completions/mean_terminated_length": 181.34375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.5077136754989624, + "epoch": 1.0355392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019854642900428564, + "kl": 0.031858205795288086, + "learning_rate": 8.292375157306155e-07, + "loss": 0.0003, + "num_tokens": 26772233.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3731887340545654, + "sampling/importance_sampling_ratio/mean": 1.000213623046875, + "sampling/importance_sampling_ratio/min": 0.6937328577041626, + "sampling/sampling_logp_difference/max": 0.36566829681396484, + "sampling/sampling_logp_difference/mean": 0.017039429396390915, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 180.1875, + "completions/mean_terminated_length": 180.1875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.41495281457901, + "epoch": 1.036764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026011467757728685, + "kl": 0.058533675968647, + "learning_rate": 8.287010432623343e-07, + "loss": 0.0005, + "num_tokens": 26799365.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7040212154388428, + "sampling/importance_sampling_ratio/mean": 0.9999061226844788, + "sampling/importance_sampling_ratio/min": 0.617724597454071, + "sampling/sampling_logp_difference/max": 0.5329909324645996, + "sampling/sampling_logp_difference/mean": 0.016645336523652077, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 168.1875, + "completions/mean_terminated_length": 168.1875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.4212398827075958, + "epoch": 1.0379901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8988033018816445, + "kl": 0.0370466485619545, + "learning_rate": 8.281639035517591e-07, + "loss": -0.0099, + "num_tokens": 26824513.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.515293836593628, + "sampling/importance_sampling_ratio/mean": 0.9997961521148682, + "sampling/importance_sampling_ratio/min": 0.7043558955192566, + "sampling/sampling_logp_difference/max": 0.41560935974121094, + "sampling/sampling_logp_difference/mean": 0.016113050282001495, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 171.34375, + "completions/mean_terminated_length": 171.34375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.37584012746810913, + "epoch": 1.0392156862745099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04717020021178661, + "kl": 0.03277963399887085, + "learning_rate": 8.276260976892495e-07, + "loss": 0.0003, + "num_tokens": 26858087.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3748070001602173, + "sampling/importance_sampling_ratio/mean": 1.0005311965942383, + "sampling/importance_sampling_ratio/min": 0.6475529074668884, + "sampling/sampling_logp_difference/max": 0.4345548152923584, + "sampling/sampling_logp_difference/mean": 0.016502805054187775, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 195.5625, + "completions/mean_terminated_length": 195.5625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.47122788429260254, + "epoch": 1.0404411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018437721552177645, + "kl": 0.028479089960455894, + "learning_rate": 8.270876267665173e-07, + "loss": 0.0003, + "num_tokens": 26891211.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4462474584579468, + "sampling/importance_sampling_ratio/mean": 1.000338077545166, + "sampling/importance_sampling_ratio/min": 0.703247606754303, + "sampling/sampling_logp_difference/max": 0.3689723014831543, + "sampling/sampling_logp_difference/mean": 0.015550898388028145, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 173.4375, + "completions/mean_terminated_length": 173.4375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3868410587310791, + "epoch": 1.0416666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7911738619881935, + "kl": 0.03665817528963089, + "learning_rate": 8.265484918766242e-07, + "loss": 0.0231, + "num_tokens": 26915495.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.388287901878357, + "sampling/importance_sampling_ratio/mean": 0.9996983408927917, + "sampling/importance_sampling_ratio/min": 0.7110201120376587, + "sampling/sampling_logp_difference/max": 0.34105461835861206, + "sampling/sampling_logp_difference/mean": 0.014347558841109276, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 162.546875, + "completions/mean_terminated_length": 162.546875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.45995205640792847, + "epoch": 1.0428921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0248271739019652, + "kl": 0.03669149428606033, + "learning_rate": 8.260086941139804e-07, + "loss": 0.0013, + "num_tokens": 26949210.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5009289979934692, + "sampling/importance_sampling_ratio/mean": 0.9992746114730835, + "sampling/importance_sampling_ratio/min": 0.781379222869873, + "sampling/sampling_logp_difference/max": 0.4060842990875244, + "sampling/sampling_logp_difference/mean": 0.016376223415136337, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 195.765625, + "completions/mean_terminated_length": 195.765625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.5225400924682617, + "epoch": 1.0441176470588236, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.351549946282612, + "kl": 0.059774432331323624, + "learning_rate": 8.254682345743405e-07, + "loss": -0.0329, + "num_tokens": 26978331.0, + "reward": 0.65625, + "reward_std": 0.5539814233779907, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.2851217985153198, + "sampling/importance_sampling_ratio/mean": 0.999582052230835, + "sampling/importance_sampling_ratio/min": 0.6393632292747498, + "sampling/sampling_logp_difference/max": 0.4472825527191162, + "sampling/sampling_logp_difference/mean": 0.018038488924503326, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 227.421875, + "completions/mean_terminated_length": 227.421875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4862522482872009, + "epoch": 1.045343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6804139902347937, + "kl": 0.03782299533486366, + "learning_rate": 8.249271143548036e-07, + "loss": 0.0221, + "num_tokens": 27012870.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4067710638046265, + "sampling/importance_sampling_ratio/mean": 0.9999182820320129, + "sampling/importance_sampling_ratio/min": 0.6331161856651306, + "sampling/sampling_logp_difference/max": 0.45710134506225586, + "sampling/sampling_logp_difference/mean": 0.015550434589385986, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 196.390625, + "completions/mean_terminated_length": 196.390625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.45215368270874023, + "epoch": 1.0465686274509804, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4462448482716836, + "kl": 0.03857942670583725, + "learning_rate": 8.243853345538093e-07, + "loss": 0.0486, + "num_tokens": 27048175.0, + "reward": 0.6875, + "reward_std": 0.551956295967102, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6944257020950317, + "sampling/importance_sampling_ratio/mean": 0.9998030066490173, + "sampling/importance_sampling_ratio/min": 0.7130687832832336, + "sampling/sampling_logp_difference/max": 0.5273438692092896, + "sampling/sampling_logp_difference/mean": 0.01590794324874878, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 213.828125, + "completions/mean_terminated_length": 213.828125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3685460686683655, + "epoch": 1.0477941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8596830434152778, + "kl": 0.035765089094638824, + "learning_rate": 8.238428962711362e-07, + "loss": -0.004, + "num_tokens": 27079172.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5653468370437622, + "sampling/importance_sampling_ratio/mean": 0.9990967512130737, + "sampling/importance_sampling_ratio/min": 0.4404228627681732, + "sampling/sampling_logp_difference/max": 0.8200199604034424, + "sampling/sampling_logp_difference/mean": 0.013786762952804565, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 201.640625, + "completions/mean_terminated_length": 201.640625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4726966917514801, + "epoch": 1.0490196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04732635726139525, + "kl": 0.036044515669345856, + "learning_rate": 8.232998006078997e-07, + "loss": 0.0004, + "num_tokens": 27112253.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4486273527145386, + "sampling/importance_sampling_ratio/mean": 1.000450849533081, + "sampling/importance_sampling_ratio/min": 0.6108221411705017, + "sampling/sampling_logp_difference/max": 0.4929494857788086, + "sampling/sampling_logp_difference/mean": 0.015688953921198845, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 182.015625, + "completions/mean_terminated_length": 182.015625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3683471977710724, + "epoch": 1.0502450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020066894840116995, + "kl": 0.03780883550643921, + "learning_rate": 8.227560486665498e-07, + "loss": 0.0004, + "num_tokens": 27141054.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4763715267181396, + "sampling/importance_sampling_ratio/mean": 0.9994968771934509, + "sampling/importance_sampling_ratio/min": 0.7207610607147217, + "sampling/sampling_logp_difference/max": 0.38958740234375, + "sampling/sampling_logp_difference/mean": 0.013770157471299171, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 159.375, + "completions/mean_terminated_length": 159.375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.48345518112182617, + "epoch": 1.0514705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9422495834300965, + "kl": 0.05197905749082565, + "learning_rate": 8.222116415508682e-07, + "loss": 0.0267, + "num_tokens": 27167206.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3234844207763672, + "sampling/importance_sampling_ratio/mean": 1.0000250339508057, + "sampling/importance_sampling_ratio/min": 0.6663122773170471, + "sampling/sampling_logp_difference/max": 0.40599679946899414, + "sampling/sampling_logp_difference/mean": 0.0172736719250679, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 189.859375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.40405958890914917, + "epoch": 1.0526960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02064729612060633, + "kl": 0.03284838795661926, + "learning_rate": 8.21666580365967e-07, + "loss": 0.0003, + "num_tokens": 27202957.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.472433090209961, + "sampling/importance_sampling_ratio/mean": 1.000199317932129, + "sampling/importance_sampling_ratio/min": 0.6393764019012451, + "sampling/sampling_logp_difference/max": 0.4472620487213135, + "sampling/sampling_logp_difference/mean": 0.014828795567154884, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 211.03125, + "completions/mean_terminated_length": 211.03125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.5035507678985596, + "epoch": 1.053921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021765292983725287, + "kl": 0.052284855395555496, + "learning_rate": 8.211208662182858e-07, + "loss": 0.0004, + "num_tokens": 27237583.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002892017364502, + "sampling/importance_sampling_ratio/min": 0.7212308645248413, + "sampling/sampling_logp_difference/max": 0.8912079334259033, + "sampling/sampling_logp_difference/mean": 0.016904963180422783, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 173.359375, + "completions/mean_terminated_length": 173.359375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.5036715269088745, + "epoch": 1.0551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.183793972805832, + "kl": 0.05257159471511841, + "learning_rate": 8.205745002155899e-07, + "loss": -0.1866, + "num_tokens": 27267126.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.383525013923645, + "sampling/importance_sampling_ratio/mean": 0.9996995329856873, + "sampling/importance_sampling_ratio/min": 0.6777058839797974, + "sampling/sampling_logp_difference/max": 0.3890419006347656, + "sampling/sampling_logp_difference/mean": 0.01687629148364067, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 188.578125, + "completions/mean_terminated_length": 188.578125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4505173861980438, + "epoch": 1.0563725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02493041736236707, + "kl": 0.04182557761669159, + "learning_rate": 8.200274834669675e-07, + "loss": 0.0004, + "num_tokens": 27294459.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4025495052337646, + "sampling/importance_sampling_ratio/mean": 0.9999800324440002, + "sampling/importance_sampling_ratio/min": 0.6474964022636414, + "sampling/sampling_logp_difference/max": 0.43464207649230957, + "sampling/sampling_logp_difference/mean": 0.015560134314000607, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 188.125, + "completions/mean_terminated_length": 188.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.4979953169822693, + "epoch": 1.0575980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03040486590745786, + "kl": 0.050923287868499756, + "learning_rate": 8.194798170828279e-07, + "loss": 0.0005, + "num_tokens": 27326403.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6092044115066528, + "sampling/importance_sampling_ratio/mean": 0.999182939529419, + "sampling/importance_sampling_ratio/min": 0.7245404720306396, + "sampling/sampling_logp_difference/max": 0.4757399559020996, + "sampling/sampling_logp_difference/mean": 0.016028741374611855, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 177.40625, + "completions/mean_terminated_length": 177.40625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4137076735496521, + "epoch": 1.0588235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02850708317913002, + "kl": 0.03606805577874184, + "learning_rate": 8.189315021748993e-07, + "loss": 0.0004, + "num_tokens": 27354621.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5470267534255981, + "sampling/importance_sampling_ratio/mean": 1.0001707077026367, + "sampling/importance_sampling_ratio/min": 0.6292845606803894, + "sampling/sampling_logp_difference/max": 0.46317172050476074, + "sampling/sampling_logp_difference/mean": 0.0135587677359581, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 217.140625, + "completions/mean_terminated_length": 217.140625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.43804386258125305, + "epoch": 1.0600490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019411988475436792, + "kl": 0.03151471167802811, + "learning_rate": 8.183825398562263e-07, + "loss": 0.0003, + "num_tokens": 27386310.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3624889850616455, + "sampling/importance_sampling_ratio/mean": 0.9998658895492554, + "sampling/importance_sampling_ratio/min": 0.562029242515564, + "sampling/sampling_logp_difference/max": 0.5762014389038086, + "sampling/sampling_logp_difference/mean": 0.014450537040829659, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 139.828125, + "completions/mean_terminated_length": 139.828125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.5208714008331299, + "epoch": 1.0612745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03417919040874343, + "kl": 0.06288649886846542, + "learning_rate": 8.178329312411676e-07, + "loss": 0.0006, + "num_tokens": 27413643.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.364408016204834, + "sampling/importance_sampling_ratio/mean": 1.0002866983413696, + "sampling/importance_sampling_ratio/min": 0.6147370934486389, + "sampling/sampling_logp_difference/max": 0.486560583114624, + "sampling/sampling_logp_difference/mean": 0.017352543771266937, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 199.515625, + "completions/mean_terminated_length": 199.515625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4414290189743042, + "epoch": 1.0625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02473058072426531, + "kl": 0.04589700326323509, + "learning_rate": 8.172826774453936e-07, + "loss": 0.0004, + "num_tokens": 27438748.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3940861225128174, + "sampling/importance_sampling_ratio/mean": 1.0002847909927368, + "sampling/importance_sampling_ratio/min": 0.6868340969085693, + "sampling/sampling_logp_difference/max": 0.3756624460220337, + "sampling/sampling_logp_difference/mean": 0.014572693035006523, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 190.1875, + "completions/mean_terminated_length": 190.1875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.6011121273040771, + "epoch": 1.0637254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0148178383460087, + "kl": 0.051899440586566925, + "learning_rate": 8.16731779585885e-07, + "loss": -0.1049, + "num_tokens": 27475528.0, + "reward": -0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5088313817977905, + "sampling/importance_sampling_ratio/mean": 0.9997458457946777, + "sampling/importance_sampling_ratio/min": 0.7129045724868774, + "sampling/sampling_logp_difference/max": 0.41133546829223633, + "sampling/sampling_logp_difference/mean": 0.019103452563285828, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 203.078125, + "completions/mean_terminated_length": 203.078125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.5007017850875854, + "epoch": 1.0649509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02232365206301372, + "kl": 0.04034339264035225, + "learning_rate": 8.161802387809292e-07, + "loss": 0.0004, + "num_tokens": 27506813.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4619460105895996, + "sampling/importance_sampling_ratio/mean": 0.9996985793113708, + "sampling/importance_sampling_ratio/min": 0.6675713062286377, + "sampling/sampling_logp_difference/max": 0.40410900115966797, + "sampling/sampling_logp_difference/mean": 0.01666172593832016, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 255.515625, + "completions/mean_terminated_length": 255.515625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4940684139728546, + "epoch": 1.0661764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023715901489987402, + "kl": 0.03826040029525757, + "learning_rate": 8.156280561501194e-07, + "loss": 0.0004, + "num_tokens": 27546030.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5691092014312744, + "sampling/importance_sampling_ratio/mean": 0.9998692870140076, + "sampling/importance_sampling_ratio/min": 0.6319237947463989, + "sampling/sampling_logp_difference/max": 0.4589865207672119, + "sampling/sampling_logp_difference/mean": 0.015382746234536171, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 206.1875, + "completions/mean_terminated_length": 206.1875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4447386860847473, + "epoch": 1.0674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021946825285994783, + "kl": 0.040563203394412994, + "learning_rate": 8.150752328143513e-07, + "loss": 0.0004, + "num_tokens": 27579738.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.34888756275177, + "sampling/importance_sampling_ratio/mean": 0.9999980926513672, + "sampling/importance_sampling_ratio/min": 0.5510970950126648, + "sampling/sampling_logp_difference/max": 0.5958442687988281, + "sampling/sampling_logp_difference/mean": 0.014817440882325172, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 250.890625, + "completions/mean_terminated_length": 250.890625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.5305104851722717, + "epoch": 1.0686274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02157410289594113, + "kl": 0.04023445397615433, + "learning_rate": 8.145217698958211e-07, + "loss": 0.0004, + "num_tokens": 27612931.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.401291012763977, + "sampling/importance_sampling_ratio/mean": 1.0001248121261597, + "sampling/importance_sampling_ratio/min": 0.6908958554267883, + "sampling/sampling_logp_difference/max": 0.3697662353515625, + "sampling/sampling_logp_difference/mean": 0.015897506847977638, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 192.578125, + "completions/mean_terminated_length": 192.578125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.452565997838974, + "epoch": 1.0698529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021513085202711633, + "kl": 0.04016255587339401, + "learning_rate": 8.139676685180236e-07, + "loss": 0.0004, + "num_tokens": 27641848.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5309851169586182, + "sampling/importance_sampling_ratio/mean": 0.9997951984405518, + "sampling/importance_sampling_ratio/min": 0.6802050471305847, + "sampling/sampling_logp_difference/max": 0.42591142654418945, + "sampling/sampling_logp_difference/mean": 0.013624733313918114, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 219.296875, + "completions/mean_terminated_length": 219.296875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.38810229301452637, + "epoch": 1.071078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01823538234640203, + "kl": 0.03198229894042015, + "learning_rate": 8.134129298057495e-07, + "loss": 0.0003, + "num_tokens": 27673723.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4551739692687988, + "sampling/importance_sampling_ratio/mean": 1.0002098083496094, + "sampling/importance_sampling_ratio/min": 0.6292175054550171, + "sampling/sampling_logp_difference/max": 0.46327829360961914, + "sampling/sampling_logp_difference/mean": 0.013081250712275505, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 199.984375, + "completions/mean_terminated_length": 199.984375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.47072750329971313, + "epoch": 1.0723039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027011691257969865, + "kl": 0.03940771520137787, + "learning_rate": 8.128575548850832e-07, + "loss": 0.0004, + "num_tokens": 27702010.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.458251714706421, + "sampling/importance_sampling_ratio/mean": 0.9996449947357178, + "sampling/importance_sampling_ratio/min": 0.6385555267333984, + "sampling/sampling_logp_difference/max": 0.4485466480255127, + "sampling/sampling_logp_difference/mean": 0.015245177783071995, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 192.6875, + "completions/mean_terminated_length": 192.6875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.5234087705612183, + "epoch": 1.0735294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023714850434828398, + "kl": 0.043784648180007935, + "learning_rate": 8.123015448834005e-07, + "loss": 0.0004, + "num_tokens": 27733526.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4594935178756714, + "sampling/importance_sampling_ratio/mean": 1.0002353191375732, + "sampling/importance_sampling_ratio/min": 0.6867315769195557, + "sampling/sampling_logp_difference/max": 0.37808942794799805, + "sampling/sampling_logp_difference/mean": 0.016243984922766685, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 231.734375, + "completions/mean_terminated_length": 231.734375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4006674587726593, + "epoch": 1.0747549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6290842036901704, + "kl": 0.051520030945539474, + "learning_rate": 8.117449009293668e-07, + "loss": 0.0082, + "num_tokens": 27763413.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.2746437788009644, + "sampling/importance_sampling_ratio/mean": 0.9999599456787109, + "sampling/importance_sampling_ratio/min": 0.7638634443283081, + "sampling/sampling_logp_difference/max": 0.2693662643432617, + "sampling/sampling_logp_difference/mean": 0.013918805867433548, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 215.78125, + "completions/mean_terminated_length": 215.78125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4207499325275421, + "epoch": 1.0759803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7575280902031539, + "kl": 0.044188033789396286, + "learning_rate": 8.111876241529337e-07, + "loss": -0.0273, + "num_tokens": 27794839.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6420798301696777, + "sampling/importance_sampling_ratio/mean": 1.0000677108764648, + "sampling/importance_sampling_ratio/min": 0.6742939352989197, + "sampling/sampling_logp_difference/max": 0.49596357345581055, + "sampling/sampling_logp_difference/mean": 0.014050956815481186, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 211.890625, + "completions/mean_terminated_length": 211.890625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.49590641260147095, + "epoch": 1.0772058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7160266551808369, + "kl": 0.059609800577163696, + "learning_rate": 8.106297156853379e-07, + "loss": 0.0037, + "num_tokens": 27824096.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5287641286849976, + "sampling/importance_sampling_ratio/mean": 1.0005055665969849, + "sampling/importance_sampling_ratio/min": 0.6962414383888245, + "sampling/sampling_logp_difference/max": 0.42445969581604004, + "sampling/sampling_logp_difference/mean": 0.01589365303516388, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 236.71875, + "completions/mean_terminated_length": 236.71875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.4957412779331207, + "epoch": 1.0784313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7651120974169411, + "kl": 0.039259374141693115, + "learning_rate": 8.100711766590982e-07, + "loss": -0.0129, + "num_tokens": 27857598.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4374808073043823, + "sampling/importance_sampling_ratio/mean": 1.0001131296157837, + "sampling/importance_sampling_ratio/min": 0.7092178463935852, + "sampling/sampling_logp_difference/max": 0.36289215087890625, + "sampling/sampling_logp_difference/mean": 0.015798121690750122, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 156.265625, + "completions/mean_terminated_length": 156.265625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.38593214750289917, + "epoch": 1.079656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027970369726454135, + "kl": 0.041295409202575684, + "learning_rate": 8.095120082080134e-07, + "loss": 0.0004, + "num_tokens": 27883247.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3590514659881592, + "sampling/importance_sampling_ratio/mean": 0.9997127056121826, + "sampling/importance_sampling_ratio/min": 0.6409491300582886, + "sampling/sampling_logp_difference/max": 0.4448051452636719, + "sampling/sampling_logp_difference/mean": 0.014706568792462349, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 213.640625, + "completions/mean_terminated_length": 213.640625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.5414191484451294, + "epoch": 1.0808823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025291530606162897, + "kl": 0.050102598965168, + "learning_rate": 8.089522114671602e-07, + "loss": 0.0005, + "num_tokens": 27918648.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4569766521453857, + "sampling/importance_sampling_ratio/mean": 1.0000700950622559, + "sampling/importance_sampling_ratio/min": 0.6865126490592957, + "sampling/sampling_logp_difference/max": 0.37636351585388184, + "sampling/sampling_logp_difference/mean": 0.01677125319838524, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 220.421875, + "completions/mean_terminated_length": 220.421875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.5107558965682983, + "epoch": 1.0821078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02311364768778726, + "kl": 0.03638807311654091, + "learning_rate": 8.083917875728905e-07, + "loss": 0.0004, + "num_tokens": 27952627.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.629607915878296, + "sampling/importance_sampling_ratio/mean": 0.9997633099555969, + "sampling/importance_sampling_ratio/min": 0.635168731212616, + "sampling/sampling_logp_difference/max": 0.4883394241333008, + "sampling/sampling_logp_difference/mean": 0.017013484612107277, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 211.5, + "completions/mean_terminated_length": 211.5, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4580460488796234, + "epoch": 1.0833333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7049973427062836, + "kl": 0.04482467472553253, + "learning_rate": 8.07830737662829e-07, + "loss": 0.0085, + "num_tokens": 27985251.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5376219749450684, + "sampling/importance_sampling_ratio/mean": 0.9997838139533997, + "sampling/importance_sampling_ratio/min": 0.6243430376052856, + "sampling/sampling_logp_difference/max": 0.471055269241333, + "sampling/sampling_logp_difference/mean": 0.014974001795053482, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 158.34375, + "completions/mean_terminated_length": 158.34375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.45414239168167114, + "epoch": 1.0845588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019143579287490053, + "kl": 0.03195573389530182, + "learning_rate": 8.072690628758721e-07, + "loss": 0.0003, + "num_tokens": 28013273.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.356048822402954, + "sampling/importance_sampling_ratio/mean": 1.0004680156707764, + "sampling/importance_sampling_ratio/min": 0.7771909832954407, + "sampling/sampling_logp_difference/max": 0.30457520484924316, + "sampling/sampling_logp_difference/mean": 0.015054703690111637, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 209.828125, + "completions/mean_terminated_length": 209.828125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.353268563747406, + "epoch": 1.0857843137254901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014608857233496962, + "kl": 0.02390231378376484, + "learning_rate": 8.067067643521833e-07, + "loss": 0.0002, + "num_tokens": 28043390.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5793465375900269, + "sampling/importance_sampling_ratio/mean": 1.000211477279663, + "sampling/importance_sampling_ratio/min": 0.695388674736023, + "sampling/sampling_logp_difference/max": 0.45701122283935547, + "sampling/sampling_logp_difference/mean": 0.012544278055429459, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 260.5625, + "completions/mean_terminated_length": 260.5625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.4316437244415283, + "epoch": 1.0870098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016694614006907024, + "kl": 0.028756748884916306, + "learning_rate": 8.061438432331934e-07, + "loss": 0.0003, + "num_tokens": 28079250.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001678466796875, + "sampling/importance_sampling_ratio/min": 0.6582571864128113, + "sampling/sampling_logp_difference/max": 0.8504958152770996, + "sampling/sampling_logp_difference/mean": 0.01411934569478035, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 222.15625, + "completions/mean_terminated_length": 222.15625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4280780553817749, + "epoch": 1.088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0223459707735028, + "kl": 0.036168213933706284, + "learning_rate": 8.055803006615965e-07, + "loss": -0.1477, + "num_tokens": 28108428.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.384476900100708, + "sampling/importance_sampling_ratio/mean": 0.9997614622116089, + "sampling/importance_sampling_ratio/min": 0.6444646120071411, + "sampling/sampling_logp_difference/max": 0.43933534622192383, + "sampling/sampling_logp_difference/mean": 0.014062181115150452, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 170.015625, + "completions/mean_terminated_length": 170.015625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.4003869891166687, + "epoch": 1.0894607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02230605781685738, + "kl": 0.03253406286239624, + "learning_rate": 8.050161377813485e-07, + "loss": 0.0003, + "num_tokens": 28138093.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4407479763031006, + "sampling/importance_sampling_ratio/mean": 0.9998146891593933, + "sampling/importance_sampling_ratio/min": 0.6737135648727417, + "sampling/sampling_logp_difference/max": 0.394950270652771, + "sampling/sampling_logp_difference/mean": 0.015155184082686901, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 182.046875, + "completions/mean_terminated_length": 182.046875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.43908727169036865, + "epoch": 1.0906862745098038, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06986324859247232, + "kl": 0.03759096935391426, + "learning_rate": 8.04451355737664e-07, + "loss": 0.0004, + "num_tokens": 28165760.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6099239587783813, + "sampling/importance_sampling_ratio/mean": 1.0006098747253418, + "sampling/importance_sampling_ratio/min": 0.12663482129573822, + "sampling/sampling_logp_difference/max": 2.0664477348327637, + "sampling/sampling_logp_difference/mean": 0.0157114639878273, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 198.65625, + "completions/mean_terminated_length": 198.65625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4717601537704468, + "epoch": 1.0919117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03376924190707671, + "kl": 0.04111418128013611, + "learning_rate": 8.03885955677015e-07, + "loss": 0.0004, + "num_tokens": 28200954.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5522520542144775, + "sampling/importance_sampling_ratio/mean": 1.0006616115570068, + "sampling/importance_sampling_ratio/min": 0.664037823677063, + "sampling/sampling_logp_difference/max": 0.43970680236816406, + "sampling/sampling_logp_difference/mean": 0.01586918905377388, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 233.34375, + "completions/mean_terminated_length": 233.34375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.4475059509277344, + "epoch": 1.093137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7565513334532075, + "kl": 0.04115685820579529, + "learning_rate": 8.033199387471276e-07, + "loss": 0.0124, + "num_tokens": 28245104.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.4491372108459473, + "sampling/importance_sampling_ratio/mean": 1.0002517700195312, + "sampling/importance_sampling_ratio/min": 0.5148535966873169, + "sampling/sampling_logp_difference/max": 0.6638727188110352, + "sampling/sampling_logp_difference/mean": 0.015666604042053223, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 226.484375, + "completions/mean_terminated_length": 226.484375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.35593605041503906, + "epoch": 1.094362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015531131966394135, + "kl": 0.030154259875416756, + "learning_rate": 8.027533060969806e-07, + "loss": 0.0003, + "num_tokens": 28280367.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3830482959747314, + "sampling/importance_sampling_ratio/mean": 0.9998922944068909, + "sampling/importance_sampling_ratio/min": 0.5699650645256042, + "sampling/sampling_logp_difference/max": 0.5621802806854248, + "sampling/sampling_logp_difference/mean": 0.012990564107894897, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 215.15625, + "completions/mean_terminated_length": 215.15625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.41725218296051025, + "epoch": 1.0955882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6636959082142803, + "kl": 0.02948814630508423, + "learning_rate": 8.021860588768021e-07, + "loss": -0.0097, + "num_tokens": 28309177.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4687278270721436, + "sampling/importance_sampling_ratio/mean": 1.0000178813934326, + "sampling/importance_sampling_ratio/min": 0.6812041997909546, + "sampling/sampling_logp_difference/max": 0.3843965530395508, + "sampling/sampling_logp_difference/mean": 0.014885883778333664, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 214.609375, + "completions/mean_terminated_length": 214.609375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.40551576018333435, + "epoch": 1.0968137254901962, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017015254100040154, + "kl": 0.02709297090768814, + "learning_rate": 8.016181982380681e-07, + "loss": 0.0003, + "num_tokens": 28340528.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.445603609085083, + "sampling/importance_sampling_ratio/mean": 1.000315546989441, + "sampling/importance_sampling_ratio/min": 0.6513819098472595, + "sampling/sampling_logp_difference/max": 0.4286590814590454, + "sampling/sampling_logp_difference/mean": 0.014110507443547249, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 141.015625, + "completions/mean_terminated_length": 141.015625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.36902138590812683, + "epoch": 1.0980392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021789493080759108, + "kl": 0.03306157514452934, + "learning_rate": 8.010497253335e-07, + "loss": 0.0003, + "num_tokens": 28365153.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3766553401947021, + "sampling/importance_sampling_ratio/mean": 1.0002944469451904, + "sampling/importance_sampling_ratio/min": 0.6906982064247131, + "sampling/sampling_logp_difference/max": 0.3700523376464844, + "sampling/sampling_logp_difference/mean": 0.0150392334908247, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 634.0, + "completions/max_terminated_length": 634.0, + "completions/mean_length": 220.5625, + "completions/mean_terminated_length": 220.5625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.41868674755096436, + "epoch": 1.099264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5352552104936306, + "kl": 0.028247395530343056, + "learning_rate": 8.004806413170612e-07, + "loss": -0.0639, + "num_tokens": 28396261.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6137233972549438, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.7313059568405151, + "sampling/sampling_logp_difference/max": 0.4785442352294922, + "sampling/sampling_logp_difference/mean": 0.014314599335193634, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 225.828125, + "completions/mean_terminated_length": 225.828125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4448871612548828, + "epoch": 1.1004901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01983500442272238, + "kl": 0.03638867661356926, + "learning_rate": 7.999109473439569e-07, + "loss": 0.0003, + "num_tokens": 28427466.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5072325468063354, + "sampling/importance_sampling_ratio/mean": 1.0005900859832764, + "sampling/importance_sampling_ratio/min": 0.6686456203460693, + "sampling/sampling_logp_difference/max": 0.4102752208709717, + "sampling/sampling_logp_difference/mean": 0.015608585439622402, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 209.109375, + "completions/mean_terminated_length": 209.109375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3570801615715027, + "epoch": 1.1017156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6537754568071588, + "kl": 0.039102356880903244, + "learning_rate": 7.993406445706292e-07, + "loss": 0.0209, + "num_tokens": 28459969.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.441141128540039, + "sampling/importance_sampling_ratio/mean": 0.9999960660934448, + "sampling/importance_sampling_ratio/min": 0.6254708170890808, + "sampling/sampling_logp_difference/max": 0.4692506790161133, + "sampling/sampling_logp_difference/mean": 0.013170319609344006, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 174.8125, + "completions/mean_terminated_length": 174.8125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3478234112262726, + "epoch": 1.1029411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016751932293840175, + "kl": 0.0287268478423357, + "learning_rate": 7.987697341547568e-07, + "loss": 0.0003, + "num_tokens": 28484805.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4843366146087646, + "sampling/importance_sampling_ratio/mean": 1.000046968460083, + "sampling/importance_sampling_ratio/min": 0.6069750785827637, + "sampling/sampling_logp_difference/max": 0.499267578125, + "sampling/sampling_logp_difference/mean": 0.01609545573592186, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 193.546875, + "completions/mean_terminated_length": 193.546875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.37323224544525146, + "epoch": 1.1041666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015952773364252184, + "kl": 0.030691295862197876, + "learning_rate": 7.981982172552517e-07, + "loss": 0.0003, + "num_tokens": 28515672.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.44472074508667, + "sampling/importance_sampling_ratio/mean": 1.0006976127624512, + "sampling/importance_sampling_ratio/min": 0.693408727645874, + "sampling/sampling_logp_difference/max": 0.3679161071777344, + "sampling/sampling_logp_difference/mean": 0.014546047896146774, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 167.53125, + "completions/mean_terminated_length": 167.53125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.32717472314834595, + "epoch": 1.1053921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02514619694392657, + "kl": 0.031751178205013275, + "learning_rate": 7.976260950322571e-07, + "loss": 0.0003, + "num_tokens": 28540826.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5320740938186646, + "sampling/importance_sampling_ratio/mean": 0.9992892146110535, + "sampling/importance_sampling_ratio/min": 0.4944298565387726, + "sampling/sampling_logp_difference/max": 0.7043499946594238, + "sampling/sampling_logp_difference/mean": 0.014897492714226246, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 222.8125, + "completions/mean_terminated_length": 222.8125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.45857465267181396, + "epoch": 1.1066176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8008073300805483, + "kl": 0.03539489582180977, + "learning_rate": 7.970533686471448e-07, + "loss": 0.0055, + "num_tokens": 28579134.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3872824907302856, + "sampling/importance_sampling_ratio/mean": 0.9999810457229614, + "sampling/importance_sampling_ratio/min": 0.6253830790519714, + "sampling/sampling_logp_difference/max": 0.469390869140625, + "sampling/sampling_logp_difference/mean": 0.015618794597685337, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 200.328125, + "completions/mean_terminated_length": 200.328125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.30752766132354736, + "epoch": 1.107843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01553239892624823, + "kl": 0.026053044945001602, + "learning_rate": 7.964800392625128e-07, + "loss": 0.0002, + "num_tokens": 28610195.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 0.9998340010643005, + "sampling/importance_sampling_ratio/min": 0.680486798286438, + "sampling/sampling_logp_difference/max": 0.42380309104919434, + "sampling/sampling_logp_difference/mean": 0.012396201491355896, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 174.0625, + "completions/mean_terminated_length": 174.0625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.323736310005188, + "epoch": 1.1090686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8197624338844457, + "kl": 0.03432891145348549, + "learning_rate": 7.959061080421838e-07, + "loss": -0.0107, + "num_tokens": 28639703.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6952557563781738, + "sampling/importance_sampling_ratio/mean": 0.9998811483383179, + "sampling/importance_sampling_ratio/min": 0.7749429941177368, + "sampling/sampling_logp_difference/max": 0.5278335809707642, + "sampling/sampling_logp_difference/mean": 0.0137240681797266, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 155.3125, + "completions/mean_terminated_length": 155.3125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.40583595633506775, + "epoch": 1.1102941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024540924093596973, + "kl": 0.03756669536232948, + "learning_rate": 7.953315761512017e-07, + "loss": 0.0004, + "num_tokens": 28666139.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4206631183624268, + "sampling/importance_sampling_ratio/mean": 0.9998972415924072, + "sampling/importance_sampling_ratio/min": 0.6482253074645996, + "sampling/sampling_logp_difference/max": 0.4335169792175293, + "sampling/sampling_logp_difference/mean": 0.0156668983399868, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 198.90625, + "completions/mean_terminated_length": 198.90625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.43321409821510315, + "epoch": 1.1115196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016655306830246297, + "kl": 0.028577543795108795, + "learning_rate": 7.947564447558299e-07, + "loss": 0.0003, + "num_tokens": 28694597.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.286690354347229, + "sampling/importance_sampling_ratio/mean": 1.0003197193145752, + "sampling/importance_sampling_ratio/min": 0.7250173687934875, + "sampling/sampling_logp_difference/max": 0.3215596675872803, + "sampling/sampling_logp_difference/mean": 0.015658333897590637, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 224.984375, + "completions/mean_terminated_length": 224.984375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.4762854278087616, + "epoch": 1.1127450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017923507835296153, + "kl": 0.025353986769914627, + "learning_rate": 7.941807150235485e-07, + "loss": 0.0003, + "num_tokens": 28730180.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.635606050491333, + "sampling/importance_sampling_ratio/mean": 1.000074863433838, + "sampling/importance_sampling_ratio/min": 0.7178359627723694, + "sampling/sampling_logp_difference/max": 0.49201345443725586, + "sampling/sampling_logp_difference/mean": 0.015965506434440613, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 246.34375, + "completions/mean_terminated_length": 246.34375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3268033564090729, + "epoch": 1.1139705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01619742569804815, + "kl": 0.02567506767809391, + "learning_rate": 7.936043881230525e-07, + "loss": 0.0002, + "num_tokens": 28764746.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4772764444351196, + "sampling/importance_sampling_ratio/mean": 1.0004796981811523, + "sampling/importance_sampling_ratio/min": 0.6623192429542542, + "sampling/sampling_logp_difference/max": 0.41200757026672363, + "sampling/sampling_logp_difference/mean": 0.012335414066910744, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 223.015625, + "completions/mean_terminated_length": 223.015625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.4025377333164215, + "epoch": 1.1151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01725664674190133, + "kl": 0.030949950218200684, + "learning_rate": 7.930274652242491e-07, + "loss": 0.0003, + "num_tokens": 28795531.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5744267702102661, + "sampling/importance_sampling_ratio/mean": 1.000022530555725, + "sampling/importance_sampling_ratio/min": 0.6672366857528687, + "sampling/sampling_logp_difference/max": 0.4538912773132324, + "sampling/sampling_logp_difference/mean": 0.015366973355412483, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 190.484375, + "completions/mean_terminated_length": 190.484375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3960995376110077, + "epoch": 1.116421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04813797084800073, + "kl": 0.04051578789949417, + "learning_rate": 7.924499474982551e-07, + "loss": 0.0004, + "num_tokens": 28832458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.566362738609314, + "sampling/importance_sampling_ratio/mean": 1.0004899501800537, + "sampling/importance_sampling_ratio/min": 0.6762630939483643, + "sampling/sampling_logp_difference/max": 0.44875621795654297, + "sampling/sampling_logp_difference/mean": 0.014583173207938671, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 195.78125, + "completions/mean_terminated_length": 195.78125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.47338631749153137, + "epoch": 1.1176470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023064169387888838, + "kl": 0.029132433235645294, + "learning_rate": 7.91871836117395e-07, + "loss": 0.0003, + "num_tokens": 28859196.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5592929124832153, + "sampling/importance_sampling_ratio/mean": 1.0006070137023926, + "sampling/importance_sampling_ratio/min": 0.7059069275856018, + "sampling/sampling_logp_difference/max": 0.4442324638366699, + "sampling/sampling_logp_difference/mean": 0.01768975704908371, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 189.4375, + "completions/mean_terminated_length": 189.4375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3763211667537689, + "epoch": 1.1188725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01928194689201758, + "kl": 0.02983258292078972, + "learning_rate": 7.91293132255198e-07, + "loss": 0.0003, + "num_tokens": 28892296.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999362230300903, + "sampling/importance_sampling_ratio/min": 0.6273563504219055, + "sampling/sampling_logp_difference/max": 0.8365778923034668, + "sampling/sampling_logp_difference/mean": 0.01438450999557972, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 189.3125, + "completions/mean_terminated_length": 189.3125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.363294780254364, + "epoch": 1.1200980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017272240877062427, + "kl": 0.02581132762134075, + "learning_rate": 7.907138370863967e-07, + "loss": 0.0003, + "num_tokens": 28921964.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6357020139694214, + "sampling/importance_sampling_ratio/mean": 1.0008468627929688, + "sampling/importance_sampling_ratio/min": 0.1908649355173111, + "sampling/sampling_logp_difference/max": 1.656189203262329, + "sampling/sampling_logp_difference/mean": 0.01467039342969656, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 213.046875, + "completions/mean_terminated_length": 213.046875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.42518073320388794, + "epoch": 1.1213235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01869874756915245, + "kl": 0.027808737009763718, + "learning_rate": 7.901339517869232e-07, + "loss": 0.0003, + "num_tokens": 28955839.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6528089046478271, + "sampling/importance_sampling_ratio/mean": 1.0004780292510986, + "sampling/importance_sampling_ratio/min": 0.7308378219604492, + "sampling/sampling_logp_difference/max": 0.5024762153625488, + "sampling/sampling_logp_difference/mean": 0.01490770187228918, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 162.5625, + "completions/mean_terminated_length": 162.5625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3367150127887726, + "epoch": 1.1225490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017023830337671672, + "kl": 0.027068888768553734, + "learning_rate": 7.895534775339083e-07, + "loss": 0.0003, + "num_tokens": 28986899.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3988438844680786, + "sampling/importance_sampling_ratio/mean": 0.9999134540557861, + "sampling/importance_sampling_ratio/min": 0.6238155961036682, + "sampling/sampling_logp_difference/max": 0.47190046310424805, + "sampling/sampling_logp_difference/mean": 0.013619141653180122, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 228.640625, + "completions/mean_terminated_length": 228.640625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3943815529346466, + "epoch": 1.1237745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02079319192360828, + "kl": 0.03533846512436867, + "learning_rate": 7.889724155056776e-07, + "loss": 0.0003, + "num_tokens": 29029340.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9286022186279297, + "sampling/importance_sampling_ratio/mean": 1.0006740093231201, + "sampling/importance_sampling_ratio/min": 0.5990237593650818, + "sampling/sampling_logp_difference/max": 0.6567955017089844, + "sampling/sampling_logp_difference/mean": 0.014496782794594765, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 271.28125, + "completions/mean_terminated_length": 271.28125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.4110572934150696, + "epoch": 1.125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022792274933252598, + "kl": 0.027118559926748276, + "learning_rate": 7.883907668817506e-07, + "loss": 0.0003, + "num_tokens": 29066958.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.438292384147644, + "sampling/importance_sampling_ratio/mean": 0.9999451637268066, + "sampling/importance_sampling_ratio/min": 0.7216097712516785, + "sampling/sampling_logp_difference/max": 0.36345648765563965, + "sampling/sampling_logp_difference/mean": 0.013454319909214973, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 261.46875, + "completions/mean_terminated_length": 261.46875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.38979044556617737, + "epoch": 1.1262254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02604291304095247, + "kl": 0.029376091435551643, + "learning_rate": 7.878085328428368e-07, + "loss": 0.0003, + "num_tokens": 29100028.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4420621395111084, + "sampling/importance_sampling_ratio/mean": 1.0001835823059082, + "sampling/importance_sampling_ratio/min": 0.6249132752418518, + "sampling/sampling_logp_difference/max": 0.4701423645019531, + "sampling/sampling_logp_difference/mean": 0.014051743783056736, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 191.90625, + "completions/mean_terminated_length": 191.90625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.41036784648895264, + "epoch": 1.1274509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016530115017810088, + "kl": 0.02435620129108429, + "learning_rate": 7.872257145708345e-07, + "loss": 0.0002, + "num_tokens": 29133910.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6928790807724, + "sampling/importance_sampling_ratio/mean": 0.9995551705360413, + "sampling/importance_sampling_ratio/min": 0.6134859919548035, + "sampling/sampling_logp_difference/max": 0.5264307260513306, + "sampling/sampling_logp_difference/mean": 0.0158233679831028, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 201.390625, + "completions/mean_terminated_length": 201.390625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.29508695006370544, + "epoch": 1.1286764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013986273065847703, + "kl": 0.018711119890213013, + "learning_rate": 7.86642313248828e-07, + "loss": 0.0002, + "num_tokens": 29162495.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.521493673324585, + "sampling/importance_sampling_ratio/mean": 0.999792754650116, + "sampling/importance_sampling_ratio/min": 0.6860750913619995, + "sampling/sampling_logp_difference/max": 0.4196925163269043, + "sampling/sampling_logp_difference/mean": 0.0119221406057477, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 299.90625, + "completions/mean_terminated_length": 299.90625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.42031940817832947, + "epoch": 1.1299019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9134031683097505, + "kl": 0.031263936311006546, + "learning_rate": 7.860583300610847e-07, + "loss": 0.0032, + "num_tokens": 29206041.0, + "reward": -0.28125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5278496742248535, + "sampling/importance_sampling_ratio/mean": 1.0002261400222778, + "sampling/importance_sampling_ratio/min": 0.5127838253974915, + "sampling/sampling_logp_difference/max": 0.6679009199142456, + "sampling/sampling_logp_difference/mean": 0.01367473229765892, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 213.765625, + "completions/mean_terminated_length": 213.765625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.45548272132873535, + "epoch": 1.1311274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01808893076897117, + "kl": 0.028725363314151764, + "learning_rate": 7.854737661930539e-07, + "loss": 0.0003, + "num_tokens": 29235082.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4327467679977417, + "sampling/importance_sampling_ratio/mean": 1.0003161430358887, + "sampling/importance_sampling_ratio/min": 0.6732383370399475, + "sampling/sampling_logp_difference/max": 0.39565587043762207, + "sampling/sampling_logp_difference/mean": 0.016030866652727127, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 219.390625, + "completions/mean_terminated_length": 219.390625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3964219391345978, + "epoch": 1.1323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013289578435529128, + "kl": 0.024509485810995102, + "learning_rate": 7.848886228313632e-07, + "loss": 0.0002, + "num_tokens": 29268851.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3204513788223267, + "sampling/importance_sampling_ratio/mean": 1.0001609325408936, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.4826626777648926, + "sampling/sampling_logp_difference/mean": 0.0140391755849123, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 172.796875, + "completions/mean_terminated_length": 172.796875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.2802438735961914, + "epoch": 1.133578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015751847974729585, + "kl": 0.02570202574133873, + "learning_rate": 7.843029011638162e-07, + "loss": 0.0002, + "num_tokens": 29293078.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4692705869674683, + "sampling/importance_sampling_ratio/mean": 1.00046968460083, + "sampling/importance_sampling_ratio/min": 0.6958165168762207, + "sampling/sampling_logp_difference/max": 0.3847661018371582, + "sampling/sampling_logp_difference/mean": 0.012318532913923264, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 197.328125, + "completions/mean_terminated_length": 197.328125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3475108742713928, + "epoch": 1.1348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024025209368769927, + "kl": 0.03196142613887787, + "learning_rate": 7.837166023793908e-07, + "loss": 0.0003, + "num_tokens": 29324795.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5470669269561768, + "sampling/importance_sampling_ratio/mean": 0.9998123049736023, + "sampling/importance_sampling_ratio/min": 0.752821683883667, + "sampling/sampling_logp_difference/max": 0.43636083602905273, + "sampling/sampling_logp_difference/mean": 0.01308157853782177, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 172.578125, + "completions/mean_terminated_length": 172.578125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3719062805175781, + "epoch": 1.1360294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026864161189479176, + "kl": 0.03267707675695419, + "learning_rate": 7.831297276682368e-07, + "loss": 0.0003, + "num_tokens": 29349952.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6383248567581177, + "sampling/importance_sampling_ratio/mean": 0.9998093843460083, + "sampling/importance_sampling_ratio/min": 0.6067643761634827, + "sampling/sampling_logp_difference/max": 0.4996147155761719, + "sampling/sampling_logp_difference/mean": 0.016318414360284805, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 214.140625, + "completions/mean_terminated_length": 214.140625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4109461307525635, + "epoch": 1.1372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023827732349712038, + "kl": 0.03133368492126465, + "learning_rate": 7.825422782216724e-07, + "loss": 0.0003, + "num_tokens": 29384041.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3573529720306396, + "sampling/importance_sampling_ratio/mean": 0.9996014833450317, + "sampling/importance_sampling_ratio/min": 0.65472412109375, + "sampling/sampling_logp_difference/max": 0.4235413074493408, + "sampling/sampling_logp_difference/mean": 0.014543693512678146, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 200.4375, + "completions/mean_terminated_length": 200.4375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4032849371433258, + "epoch": 1.1384803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015633853504003762, + "kl": 0.022977590560913086, + "learning_rate": 7.819542552321827e-07, + "loss": 0.0002, + "num_tokens": 29412405.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 1.0006990432739258, + "sampling/importance_sampling_ratio/min": 0.6202813386917114, + "sampling/sampling_logp_difference/max": 0.49199533462524414, + "sampling/sampling_logp_difference/mean": 0.014526978135108948, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 220.9375, + "completions/mean_terminated_length": 220.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4555209279060364, + "epoch": 1.1397058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018454166474342476, + "kl": 0.029399922117590904, + "learning_rate": 7.813656598934173e-07, + "loss": 0.0003, + "num_tokens": 29443825.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4754360914230347, + "sampling/importance_sampling_ratio/mean": 0.9996351003646851, + "sampling/importance_sampling_ratio/min": 0.6344634294509888, + "sampling/sampling_logp_difference/max": 0.45497560501098633, + "sampling/sampling_logp_difference/mean": 0.016958877444267273, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 249.84375, + "completions/mean_terminated_length": 249.84375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.34868285059928894, + "epoch": 1.1409313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013567642960694574, + "kl": 0.021005570888519287, + "learning_rate": 7.807764934001874e-07, + "loss": 0.0002, + "num_tokens": 29476423.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6121875047683716, + "sampling/importance_sampling_ratio/mean": 1.000044345855713, + "sampling/importance_sampling_ratio/min": 0.6066538691520691, + "sampling/sampling_logp_difference/max": 0.49979686737060547, + "sampling/sampling_logp_difference/mean": 0.013358568772673607, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 216.390625, + "completions/mean_terminated_length": 216.390625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4077160358428955, + "epoch": 1.142156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6618215372445466, + "kl": 0.03380703181028366, + "learning_rate": 7.801867569484634e-07, + "loss": 0.0233, + "num_tokens": 29512752.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4813718795776367, + "sampling/importance_sampling_ratio/mean": 0.9998239278793335, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4679851531982422, + "sampling/sampling_logp_difference/mean": 0.015345785766839981, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 227.796875, + "completions/mean_terminated_length": 227.796875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.42371150851249695, + "epoch": 1.1433823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016883806363186343, + "kl": 0.022962680086493492, + "learning_rate": 7.795964517353733e-07, + "loss": 0.0002, + "num_tokens": 29543059.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8718515634536743, + "sampling/importance_sampling_ratio/mean": 1.0001614093780518, + "sampling/importance_sampling_ratio/min": 0.7095545530319214, + "sampling/sampling_logp_difference/max": 0.6269280910491943, + "sampling/sampling_logp_difference/mean": 0.014873354695737362, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 182.375, + "completions/mean_terminated_length": 182.375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.38495272397994995, + "epoch": 1.1446078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020400968831353655, + "kl": 0.03413142263889313, + "learning_rate": 7.790055789591993e-07, + "loss": 0.0003, + "num_tokens": 29571067.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5023506879806519, + "sampling/importance_sampling_ratio/mean": 0.9996978044509888, + "sampling/importance_sampling_ratio/min": 0.664975106716156, + "sampling/sampling_logp_difference/max": 0.4080057144165039, + "sampling/sampling_logp_difference/mean": 0.015339599922299385, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 152.671875, + "completions/mean_terminated_length": 152.671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.39188700914382935, + "epoch": 1.1458333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0193314784301376, + "kl": 0.025547783821821213, + "learning_rate": 7.784141398193753e-07, + "loss": 0.0003, + "num_tokens": 29605158.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4144792556762695, + "sampling/importance_sampling_ratio/mean": 0.9997140765190125, + "sampling/importance_sampling_ratio/min": 0.6417416334152222, + "sampling/sampling_logp_difference/max": 0.4435694217681885, + "sampling/sampling_logp_difference/mean": 0.015097095631062984, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 257.625, + "completions/mean_terminated_length": 257.625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.38108593225479126, + "epoch": 1.1470588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012087511602747795, + "kl": 0.01746346801519394, + "learning_rate": 7.778221355164857e-07, + "loss": 0.0002, + "num_tokens": 29648670.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.751657247543335, + "sampling/importance_sampling_ratio/mean": 1.0000417232513428, + "sampling/importance_sampling_ratio/min": 0.6792500615119934, + "sampling/sampling_logp_difference/max": 0.5605623722076416, + "sampling/sampling_logp_difference/mean": 0.014899947680532932, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 208.90625, + "completions/mean_terminated_length": 208.90625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.37955600023269653, + "epoch": 1.1482843137254901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01647561168103897, + "kl": 0.02749289572238922, + "learning_rate": 7.772295672522614e-07, + "loss": 0.0003, + "num_tokens": 29680936.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6303905248641968, + "sampling/importance_sampling_ratio/mean": 1.0001499652862549, + "sampling/importance_sampling_ratio/min": 0.677166223526001, + "sampling/sampling_logp_difference/max": 0.48881959915161133, + "sampling/sampling_logp_difference/mean": 0.013886782340705395, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 213.3125, + "completions/mean_terminated_length": 213.3125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4735932946205139, + "epoch": 1.1495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7218453170992659, + "kl": 0.028180256485939026, + "learning_rate": 7.766364362295788e-07, + "loss": -0.013, + "num_tokens": 29713772.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.3298238515853882, + "sampling/importance_sampling_ratio/mean": 0.9997259974479675, + "sampling/importance_sampling_ratio/min": 0.6942225694656372, + "sampling/sampling_logp_difference/max": 0.3649625778198242, + "sampling/sampling_logp_difference/mean": 0.0170447938144207, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 225.078125, + "completions/mean_terminated_length": 225.078125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.40240204334259033, + "epoch": 1.150735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01867880778206093, + "kl": 0.024466482922434807, + "learning_rate": 7.760427436524559e-07, + "loss": 0.0002, + "num_tokens": 29746673.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5001312494277954, + "sampling/importance_sampling_ratio/mean": 0.9998698830604553, + "sampling/importance_sampling_ratio/min": 0.6142581701278687, + "sampling/sampling_logp_difference/max": 0.48733997344970703, + "sampling/sampling_logp_difference/mean": 0.014945314265787601, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 215.734375, + "completions/mean_terminated_length": 215.734375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4451993405818939, + "epoch": 1.1519607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01680554671806869, + "kl": 0.027862906455993652, + "learning_rate": 7.754484907260512e-07, + "loss": 0.0003, + "num_tokens": 29778192.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4360154867172241, + "sampling/importance_sampling_ratio/mean": 1.0001566410064697, + "sampling/importance_sampling_ratio/min": 0.6816403865814209, + "sampling/sampling_logp_difference/max": 0.3832530975341797, + "sampling/sampling_logp_difference/mean": 0.016417954117059708, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 256.078125, + "completions/mean_terminated_length": 256.078125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.3542017340660095, + "epoch": 1.153186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7621131140916966, + "kl": 0.021910440176725388, + "learning_rate": 7.748536786566606e-07, + "loss": 0.0041, + "num_tokens": 29813653.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5361313819885254, + "sampling/importance_sampling_ratio/mean": 0.9997000098228455, + "sampling/importance_sampling_ratio/min": 0.6090093851089478, + "sampling/sampling_logp_difference/max": 0.49592161178588867, + "sampling/sampling_logp_difference/mean": 0.012756800279021263, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 207.140625, + "completions/mean_terminated_length": 207.140625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3920484185218811, + "epoch": 1.1544117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020545587392545597, + "kl": 0.025344235822558403, + "learning_rate": 7.742583086517149e-07, + "loss": 0.0003, + "num_tokens": 29849198.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4313565492630005, + "sampling/importance_sampling_ratio/mean": 1.00014328956604, + "sampling/importance_sampling_ratio/min": 0.6174163818359375, + "sampling/sampling_logp_difference/max": 0.4822115898132324, + "sampling/sampling_logp_difference/mean": 0.014016522094607353, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 225.46875, + "completions/mean_terminated_length": 225.46875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.32754069566726685, + "epoch": 1.155637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017125159976420713, + "kl": 0.019937563687562943, + "learning_rate": 7.736623819197773e-07, + "loss": 0.0002, + "num_tokens": 29881660.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4777964353561401, + "sampling/importance_sampling_ratio/mean": 1.0002424716949463, + "sampling/importance_sampling_ratio/min": 0.6300315856933594, + "sampling/sampling_logp_difference/max": 0.46198534965515137, + "sampling/sampling_logp_difference/mean": 0.013355264440178871, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 197.59375, + "completions/mean_terminated_length": 197.59375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.27188652753829956, + "epoch": 1.156862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01673616429022431, + "kl": 0.01753295212984085, + "learning_rate": 7.730658996705415e-07, + "loss": 0.0002, + "num_tokens": 29913826.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.491621732711792, + "sampling/importance_sampling_ratio/mean": 1.0000965595245361, + "sampling/importance_sampling_ratio/min": 0.6122355461120605, + "sampling/sampling_logp_difference/max": 0.49063825607299805, + "sampling/sampling_logp_difference/mean": 0.012826650403439999, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 170.203125, + "completions/mean_terminated_length": 170.203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.39124971628189087, + "epoch": 1.1580882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9643865686120452, + "kl": 0.03476168215274811, + "learning_rate": 7.724688631148286e-07, + "loss": -0.0043, + "num_tokens": 29942831.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4753769636154175, + "sampling/importance_sampling_ratio/mean": 0.999715268611908, + "sampling/importance_sampling_ratio/min": 0.6182939410209656, + "sampling/sampling_logp_difference/max": 0.4807913303375244, + "sampling/sampling_logp_difference/mean": 0.014948323369026184, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 229.96875, + "completions/mean_terminated_length": 229.96875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.35787203907966614, + "epoch": 1.159313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015282129492059498, + "kl": 0.021442048251628876, + "learning_rate": 7.718712734645849e-07, + "loss": 0.0002, + "num_tokens": 29975725.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.632232904434204, + "sampling/importance_sampling_ratio/mean": 0.999840259552002, + "sampling/importance_sampling_ratio/min": 0.5060733556747437, + "sampling/sampling_logp_difference/max": 0.6810736656188965, + "sampling/sampling_logp_difference/mean": 0.013961701653897762, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 192.125, + "completions/mean_terminated_length": 192.125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.36832353472709656, + "epoch": 1.1605392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023454253214559635, + "kl": 0.03226148337125778, + "learning_rate": 7.712731319328797e-07, + "loss": 0.0003, + "num_tokens": 30005557.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3227823972702026, + "sampling/importance_sampling_ratio/mean": 1.000182867050171, + "sampling/importance_sampling_ratio/min": 0.6053187847137451, + "sampling/sampling_logp_difference/max": 0.502000093460083, + "sampling/sampling_logp_difference/mean": 0.014252797700464725, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 164.46875, + "completions/mean_terminated_length": 164.46875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3432360589504242, + "epoch": 1.161764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0163520986602648, + "kl": 0.021451342850923538, + "learning_rate": 7.706744397339022e-07, + "loss": 0.0002, + "num_tokens": 30031971.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4416695833206177, + "sampling/importance_sampling_ratio/mean": 0.9997134804725647, + "sampling/importance_sampling_ratio/min": 0.6160857677459717, + "sampling/sampling_logp_difference/max": 0.48436903953552246, + "sampling/sampling_logp_difference/mean": 0.015156615525484085, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 232.546875, + "completions/mean_terminated_length": 232.546875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.44087710976600647, + "epoch": 1.1629901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9197137471296929, + "kl": 0.026438506320118904, + "learning_rate": 7.700751980829601e-07, + "loss": 0.0183, + "num_tokens": 30065638.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.461745262145996, + "sampling/importance_sampling_ratio/mean": 0.9997392892837524, + "sampling/importance_sampling_ratio/min": 0.6299211382865906, + "sampling/sampling_logp_difference/max": 0.462160587310791, + "sampling/sampling_logp_difference/mean": 0.01626892387866974, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 171.578125, + "completions/mean_terminated_length": 171.578125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3260308504104614, + "epoch": 1.1642156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8716004942013297, + "kl": 0.023889085277915, + "learning_rate": 7.694754081964754e-07, + "loss": -0.0032, + "num_tokens": 30092187.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4729878902435303, + "sampling/importance_sampling_ratio/mean": 1.000300407409668, + "sampling/importance_sampling_ratio/min": 0.7207422852516174, + "sampling/sampling_logp_difference/max": 0.38729286193847656, + "sampling/sampling_logp_difference/mean": 0.013107338920235634, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 242.734375, + "completions/mean_terminated_length": 242.734375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.34090811014175415, + "epoch": 1.1654411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01171329826460024, + "kl": 0.019636783748865128, + "learning_rate": 7.688750712919839e-07, + "loss": 0.0002, + "num_tokens": 30129146.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4735112190246582, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.5814192891120911, + "sampling/sampling_logp_difference/max": 0.5422830581665039, + "sampling/sampling_logp_difference/mean": 0.012322664260864258, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 240.953125, + "completions/mean_terminated_length": 240.953125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.4532848298549652, + "epoch": 1.1666666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5620988096409886, + "kl": 0.028768297284841537, + "learning_rate": 7.682741885881314e-07, + "loss": 0.0026, + "num_tokens": 30162903.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4075736999511719, + "sampling/importance_sampling_ratio/mean": 1.0002433061599731, + "sampling/importance_sampling_ratio/min": 0.7008883357048035, + "sampling/sampling_logp_difference/max": 0.35540664196014404, + "sampling/sampling_logp_difference/mean": 0.015215197578072548, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 207.46875, + "completions/mean_terminated_length": 207.46875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4852984547615051, + "epoch": 1.1678921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01658368194230398, + "kl": 0.024968475103378296, + "learning_rate": 7.676727613046719e-07, + "loss": 0.0003, + "num_tokens": 30197557.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.39171302318573, + "sampling/importance_sampling_ratio/mean": 1.0001628398895264, + "sampling/importance_sampling_ratio/min": 0.6412505507469177, + "sampling/sampling_logp_difference/max": 0.4443349838256836, + "sampling/sampling_logp_difference/mean": 0.016897138208150864, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 191.328125, + "completions/mean_terminated_length": 191.328125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.42920494079589844, + "epoch": 1.1691176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8958474022836896, + "kl": 0.052344582974910736, + "learning_rate": 7.670707906624643e-07, + "loss": -0.0074, + "num_tokens": 30224538.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.574435830116272, + "sampling/importance_sampling_ratio/mean": 0.9998064637184143, + "sampling/importance_sampling_ratio/min": 0.4768851101398468, + "sampling/sampling_logp_difference/max": 0.7404797077178955, + "sampling/sampling_logp_difference/mean": 0.016219427809119225, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 198.796875, + "completions/mean_terminated_length": 198.796875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3773866891860962, + "epoch": 1.170343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02937723992809643, + "kl": 0.029355604201555252, + "learning_rate": 7.664682778834712e-07, + "loss": 0.0003, + "num_tokens": 30254845.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4508373737335205, + "sampling/importance_sampling_ratio/mean": 0.999887228012085, + "sampling/importance_sampling_ratio/min": 0.6804096102714539, + "sampling/sampling_logp_difference/max": 0.38506031036376953, + "sampling/sampling_logp_difference/mean": 0.015669919550418854, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 208.25, + "completions/mean_terminated_length": 208.25, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.45939186215400696, + "epoch": 1.1715686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023625100950576464, + "kl": 0.029001720249652863, + "learning_rate": 7.658652241907554e-07, + "loss": 0.0003, + "num_tokens": 30282477.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6038897037506104, + "sampling/importance_sampling_ratio/mean": 1.0003774166107178, + "sampling/importance_sampling_ratio/min": 0.6473898887634277, + "sampling/sampling_logp_difference/max": 0.47243165969848633, + "sampling/sampling_logp_difference/mean": 0.01826980710029602, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 198.71875, + "completions/mean_terminated_length": 198.71875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3485085666179657, + "epoch": 1.1727941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021714340953082334, + "kl": 0.02960689179599285, + "learning_rate": 7.652616308084774e-07, + "loss": 0.0003, + "num_tokens": 30314427.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.429936170578003, + "sampling/importance_sampling_ratio/mean": 0.9995099306106567, + "sampling/importance_sampling_ratio/min": 0.6600603461265564, + "sampling/sampling_logp_difference/max": 0.415424108505249, + "sampling/sampling_logp_difference/mean": 0.012969018891453743, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 200.59375, + "completions/mean_terminated_length": 200.59375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3420395255088806, + "epoch": 1.1740196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6920962449991449, + "kl": 0.03241267055273056, + "learning_rate": 7.646574989618937e-07, + "loss": -0.001, + "num_tokens": 30343281.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3937641382217407, + "sampling/importance_sampling_ratio/mean": 0.9999229907989502, + "sampling/importance_sampling_ratio/min": 0.6786736845970154, + "sampling/sampling_logp_difference/max": 0.3876148462295532, + "sampling/sampling_logp_difference/mean": 0.014236288145184517, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 171.96875, + "completions/mean_terminated_length": 171.96875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4044609069824219, + "epoch": 1.1752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02605573369834518, + "kl": 0.034083012491464615, + "learning_rate": 7.640528298773536e-07, + "loss": 0.0003, + "num_tokens": 30370479.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.402698278427124, + "sampling/importance_sampling_ratio/mean": 1.0000107288360596, + "sampling/importance_sampling_ratio/min": 0.671543300151825, + "sampling/sampling_logp_difference/max": 0.3981768488883972, + "sampling/sampling_logp_difference/mean": 0.016809877008199692, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 179.890625, + "completions/mean_terminated_length": 179.890625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.4167441725730896, + "epoch": 1.1764705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019579150616267162, + "kl": 0.03047661855816841, + "learning_rate": 7.634476247822972e-07, + "loss": 0.0003, + "num_tokens": 30398056.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6280049085617065, + "sampling/importance_sampling_ratio/mean": 0.9998691082000732, + "sampling/importance_sampling_ratio/min": 0.6427965760231018, + "sampling/sampling_logp_difference/max": 0.48735523223876953, + "sampling/sampling_logp_difference/mean": 0.01612289622426033, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 196.515625, + "completions/mean_terminated_length": 196.515625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.35048040747642517, + "epoch": 1.1776960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.677471277336816, + "kl": 0.02648363634943962, + "learning_rate": 7.628418849052523e-07, + "loss": -0.0071, + "num_tokens": 30426217.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6926275491714478, + "sampling/importance_sampling_ratio/mean": 1.0004546642303467, + "sampling/importance_sampling_ratio/min": 0.6103792786598206, + "sampling/sampling_logp_difference/max": 0.5262820720672607, + "sampling/sampling_logp_difference/mean": 0.013000758364796638, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 239.15625, + "completions/mean_terminated_length": 239.15625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.36057567596435547, + "epoch": 1.178921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020773236537636602, + "kl": 0.02622954733669758, + "learning_rate": 7.622356114758327e-07, + "loss": 0.0003, + "num_tokens": 30458483.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5686147212982178, + "sampling/importance_sampling_ratio/mean": 0.9998708963394165, + "sampling/importance_sampling_ratio/min": 0.6077016592025757, + "sampling/sampling_logp_difference/max": 0.49807119369506836, + "sampling/sampling_logp_difference/mean": 0.013564372435212135, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 175.8125, + "completions/mean_terminated_length": 175.8125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3835163116455078, + "epoch": 1.1801470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020059666231379888, + "kl": 0.031151343137025833, + "learning_rate": 7.616288057247349e-07, + "loss": 0.0003, + "num_tokens": 30488839.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5970481634140015, + "sampling/importance_sampling_ratio/mean": 1.0011179447174072, + "sampling/importance_sampling_ratio/min": 0.6336846351623535, + "sampling/sampling_logp_difference/max": 0.4681570529937744, + "sampling/sampling_logp_difference/mean": 0.01436849869787693, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 191.5625, + "completions/mean_terminated_length": 191.5625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.338411420583725, + "epoch": 1.1813725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014550082356668844, + "kl": 0.020898407325148582, + "learning_rate": 7.610214688837361e-07, + "loss": 0.0002, + "num_tokens": 30527851.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4076093435287476, + "sampling/importance_sampling_ratio/mean": 0.999811589717865, + "sampling/importance_sampling_ratio/min": 0.5281767845153809, + "sampling/sampling_logp_difference/max": 0.6383242607116699, + "sampling/sampling_logp_difference/mean": 0.012341796420514584, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 171.359375, + "completions/mean_terminated_length": 171.359375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.31215670704841614, + "epoch": 1.1825980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018372978036901275, + "kl": 0.024941347539424896, + "learning_rate": 7.604136021856916e-07, + "loss": 0.0002, + "num_tokens": 30555282.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071580410003662, + "sampling/importance_sampling_ratio/mean": 1.0002996921539307, + "sampling/importance_sampling_ratio/min": 0.6637460589408875, + "sampling/sampling_logp_difference/max": 0.4102257490158081, + "sampling/sampling_logp_difference/mean": 0.013817012310028076, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 176.75, + "completions/mean_terminated_length": 176.75, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3429717719554901, + "epoch": 1.1838235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017101053742247727, + "kl": 0.022458365187048912, + "learning_rate": 7.598052068645324e-07, + "loss": 0.0002, + "num_tokens": 30588722.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4652889966964722, + "sampling/importance_sampling_ratio/mean": 1.0002515316009521, + "sampling/importance_sampling_ratio/min": 0.6105897426605225, + "sampling/sampling_logp_difference/max": 0.4933300018310547, + "sampling/sampling_logp_difference/mean": 0.0130779342725873, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 172.75, + "completions/mean_terminated_length": 172.75, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.32716017961502075, + "epoch": 1.1850490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9011340308409649, + "kl": 0.02837306074798107, + "learning_rate": 7.591962841552626e-07, + "loss": -0.023, + "num_tokens": 30625202.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4999078512191772, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.6259011626243591, + "sampling/sampling_logp_difference/max": 0.4685628414154053, + "sampling/sampling_logp_difference/mean": 0.013338066637516022, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 217.34375, + "completions/mean_terminated_length": 217.34375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.39989498257637024, + "epoch": 1.1862745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7684875311643962, + "kl": 0.04626480117440224, + "learning_rate": 7.585868352939562e-07, + "loss": 0.0353, + "num_tokens": 30655992.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5753024816513062, + "sampling/importance_sampling_ratio/mean": 1.0001583099365234, + "sampling/importance_sampling_ratio/min": 0.7037184238433838, + "sampling/sampling_logp_difference/max": 0.45444726943969727, + "sampling/sampling_logp_difference/mean": 0.015304194763302803, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 217.25, + "completions/mean_terminated_length": 217.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4833433926105499, + "epoch": 1.1875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8896629453698057, + "kl": 0.03638318181037903, + "learning_rate": 7.579768615177564e-07, + "loss": -0.0383, + "num_tokens": 30686696.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3544316291809082, + "sampling/importance_sampling_ratio/mean": 0.9995909929275513, + "sampling/importance_sampling_ratio/min": 0.6309517025947571, + "sampling/sampling_logp_difference/max": 0.4605259895324707, + "sampling/sampling_logp_difference/mean": 0.01705067604780197, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 309.75, + "completions/mean_terminated_length": 309.75, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.42330271005630493, + "epoch": 1.1887254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6170632654853302, + "kl": 0.03930829092860222, + "learning_rate": 7.57366364064871e-07, + "loss": -0.0055, + "num_tokens": 30726616.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5312823057174683, + "sampling/importance_sampling_ratio/mean": 0.9995028972625732, + "sampling/importance_sampling_ratio/min": 0.6076141595840454, + "sampling/sampling_logp_difference/max": 0.4982151985168457, + "sampling/sampling_logp_difference/mean": 0.014564193785190582, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 248.40625, + "completions/mean_terminated_length": 248.40625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.39436405897140503, + "epoch": 1.1899509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0215890274466495, + "kl": 0.03378557786345482, + "learning_rate": 7.567553441745711e-07, + "loss": -0.0112, + "num_tokens": 30765666.0, + "reward": 0.65625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6150052547454834, + "sampling/importance_sampling_ratio/mean": 0.9998675584793091, + "sampling/importance_sampling_ratio/min": 0.6254509687423706, + "sampling/sampling_logp_difference/max": 0.4793381690979004, + "sampling/sampling_logp_difference/mean": 0.01351084467023611, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 236.5, + "completions/mean_terminated_length": 236.5, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.4210416078567505, + "epoch": 1.1911764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020457549262549238, + "kl": 0.027543930336833, + "learning_rate": 7.561438030871885e-07, + "loss": 0.0003, + "num_tokens": 30797522.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5265755653381348, + "sampling/importance_sampling_ratio/mean": 1.0002741813659668, + "sampling/importance_sampling_ratio/min": 0.5889442563056946, + "sampling/sampling_logp_difference/max": 0.529423713684082, + "sampling/sampling_logp_difference/mean": 0.01547271478921175, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 285.625, + "completions/mean_terminated_length": 285.625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.36282581090927124, + "epoch": 1.1924019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8702373210575008, + "kl": 0.03914202004671097, + "learning_rate": 7.555317420441129e-07, + "loss": -0.0493, + "num_tokens": 30836442.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5971460342407227, + "sampling/importance_sampling_ratio/mean": 1.0002176761627197, + "sampling/importance_sampling_ratio/min": 0.6056472063064575, + "sampling/sampling_logp_difference/max": 0.501457691192627, + "sampling/sampling_logp_difference/mean": 0.011751336045563221, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 255.578125, + "completions/mean_terminated_length": 255.578125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3856160342693329, + "epoch": 1.1936274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026546802403762147, + "kl": 0.04335259646177292, + "learning_rate": 7.549191622877892e-07, + "loss": 0.0005, + "num_tokens": 30871407.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5312490463256836, + "sampling/importance_sampling_ratio/mean": 1.0002844333648682, + "sampling/importance_sampling_ratio/min": 0.6207950115203857, + "sampling/sampling_logp_difference/max": 0.47675442695617676, + "sampling/sampling_logp_difference/mean": 0.013014108873903751, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 264.703125, + "completions/mean_terminated_length": 264.703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.4230411648750305, + "epoch": 1.1948529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5991911776847293, + "kl": 0.026629038155078888, + "learning_rate": 7.543060650617158e-07, + "loss": 0.0116, + "num_tokens": 30906860.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5496594905853271, + "sampling/importance_sampling_ratio/mean": 0.9999898076057434, + "sampling/importance_sampling_ratio/min": 0.6419976949691772, + "sampling/sampling_logp_difference/max": 0.44317054748535156, + "sampling/sampling_logp_difference/mean": 0.013829650357365608, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 263.140625, + "completions/mean_terminated_length": 263.140625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.4759698212146759, + "epoch": 1.196078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8951274530401899, + "kl": 0.04232658073306084, + "learning_rate": 7.53692451610441e-07, + "loss": 0.0034, + "num_tokens": 30943461.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.505446195602417, + "sampling/importance_sampling_ratio/mean": 1.0001161098480225, + "sampling/importance_sampling_ratio/min": 0.6627125144004822, + "sampling/sampling_logp_difference/max": 0.4114140272140503, + "sampling/sampling_logp_difference/mean": 0.014992992393672466, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 290.609375, + "completions/mean_terminated_length": 290.609375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4588989019393921, + "epoch": 1.1973039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9283088244233804, + "kl": 0.03836783021688461, + "learning_rate": 7.530783231795614e-07, + "loss": 0.0558, + "num_tokens": 30979052.0, + "reward": 0.53125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3670469522476196, + "sampling/importance_sampling_ratio/mean": 0.9999618530273438, + "sampling/importance_sampling_ratio/min": 0.6349077224731445, + "sampling/sampling_logp_difference/max": 0.45427560806274414, + "sampling/sampling_logp_difference/mean": 0.01504638884216547, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 239.1875, + "completions/mean_terminated_length": 239.1875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.26992595195770264, + "epoch": 1.1985294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7095312256515067, + "kl": 0.022371649742126465, + "learning_rate": 7.524636810157188e-07, + "loss": -0.0168, + "num_tokens": 31012632.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5478365421295166, + "sampling/importance_sampling_ratio/mean": 1.0002949237823486, + "sampling/importance_sampling_ratio/min": 0.6262792944908142, + "sampling/sampling_logp_difference/max": 0.467958927154541, + "sampling/sampling_logp_difference/mean": 0.01214311271905899, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 187.21875, + "completions/mean_terminated_length": 187.21875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3832342028617859, + "epoch": 1.1997549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02012604263715308, + "kl": 0.036140792071819305, + "learning_rate": 7.518485263665977e-07, + "loss": 0.0004, + "num_tokens": 31041558.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.646314263343811, + "sampling/importance_sampling_ratio/mean": 1.0002691745758057, + "sampling/importance_sampling_ratio/min": 0.6513071656227112, + "sampling/sampling_logp_difference/max": 0.4985389709472656, + "sampling/sampling_logp_difference/mean": 0.014832520857453346, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 201.6875, + "completions/mean_terminated_length": 201.6875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.5023150444030762, + "epoch": 1.2009803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1706872148884344, + "kl": 0.050048939883708954, + "learning_rate": 7.512328604809232e-07, + "loss": 0.0054, + "num_tokens": 31069074.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.588779330253601, + "sampling/importance_sampling_ratio/mean": 0.9997115731239319, + "sampling/importance_sampling_ratio/min": 0.7012655138969421, + "sampling/sampling_logp_difference/max": 0.4629659652709961, + "sampling/sampling_logp_difference/mean": 0.017660582438111305, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 270.71875, + "completions/mean_terminated_length": 270.71875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.40667831897735596, + "epoch": 1.2022058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6439952382837856, + "kl": 0.027503911405801773, + "learning_rate": 7.506166846084579e-07, + "loss": 0.0027, + "num_tokens": 31104624.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.450819730758667, + "sampling/importance_sampling_ratio/mean": 0.999941349029541, + "sampling/importance_sampling_ratio/min": 0.6276686787605286, + "sampling/sampling_logp_difference/max": 0.465742826461792, + "sampling/sampling_logp_difference/mean": 0.014252717606723309, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 652.0, + "completions/max_terminated_length": 652.0, + "completions/mean_length": 256.140625, + "completions/mean_terminated_length": 256.140625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.41683530807495117, + "epoch": 1.2034313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.711185580258691, + "kl": 0.03274468705058098, + "learning_rate": 7.5e-07, + "loss": 0.0331, + "num_tokens": 31150649.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996276497840881, + "sampling/importance_sampling_ratio/min": 0.6253455281257629, + "sampling/sampling_logp_difference/max": 0.854128360748291, + "sampling/sampling_logp_difference/mean": 0.013821342028677464, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 175.765625, + "completions/mean_terminated_length": 175.765625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4364689588546753, + "epoch": 1.204656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8481335327568599, + "kl": 0.04834900051355362, + "learning_rate": 7.493828079073801e-07, + "loss": -0.0047, + "num_tokens": 31175114.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3553777933120728, + "sampling/importance_sampling_ratio/mean": 0.999114453792572, + "sampling/importance_sampling_ratio/min": 0.6089012026786804, + "sampling/sampling_logp_difference/max": 0.49609923362731934, + "sampling/sampling_logp_difference/mean": 0.01572294905781746, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 148.953125, + "completions/mean_terminated_length": 148.953125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3382428288459778, + "epoch": 1.2058823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02191739361832906, + "kl": 0.033510662615299225, + "learning_rate": 7.487651095834588e-07, + "loss": 0.0003, + "num_tokens": 31199111.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.318970799446106, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.7020292282104492, + "sampling/sampling_logp_difference/max": 0.35378026962280273, + "sampling/sampling_logp_difference/mean": 0.012856241315603256, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 248.484375, + "completions/mean_terminated_length": 248.484375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.4134531617164612, + "epoch": 1.2071078431372548, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9921352611677013, + "kl": 0.03773762285709381, + "learning_rate": 7.481469062821251e-07, + "loss": 0.018, + "num_tokens": 31232518.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.593570590019226, + "sampling/importance_sampling_ratio/mean": 1.0001540184020996, + "sampling/importance_sampling_ratio/min": 0.6361991763114929, + "sampling/sampling_logp_difference/max": 0.46597719192504883, + "sampling/sampling_logp_difference/mean": 0.014097131788730621, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 238.328125, + "completions/mean_terminated_length": 238.328125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4444141387939453, + "epoch": 1.2083333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7653636702913152, + "kl": 0.03863909840583801, + "learning_rate": 7.47528199258292e-07, + "loss": -0.0156, + "num_tokens": 31266283.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.300205945968628, + "sampling/importance_sampling_ratio/mean": 0.999968409538269, + "sampling/importance_sampling_ratio/min": 0.6790764331817627, + "sampling/sampling_logp_difference/max": 0.387021541595459, + "sampling/sampling_logp_difference/mean": 0.014662148430943489, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 180.59375, + "completions/mean_terminated_length": 180.59375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3819793164730072, + "epoch": 1.2095588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0203682283357998, + "kl": 0.03312782198190689, + "learning_rate": 7.469089897678957e-07, + "loss": 0.0003, + "num_tokens": 31291073.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 0.9995713829994202, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.014812866225838661, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 197.6875, + "completions/mean_terminated_length": 197.6875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.5025283694267273, + "epoch": 1.2107843137254901, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1871480858972723, + "kl": 0.08525514602661133, + "learning_rate": 7.462892790678925e-07, + "loss": 0.023, + "num_tokens": 31321357.0, + "reward": 0.0625, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5077842473983765, + "sampling/importance_sampling_ratio/mean": 0.9996609687805176, + "sampling/importance_sampling_ratio/min": 0.6306982636451721, + "sampling/sampling_logp_difference/max": 0.46092772483825684, + "sampling/sampling_logp_difference/mean": 0.015542639419436455, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 211.0625, + "completions/mean_terminated_length": 211.0625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.5481150150299072, + "epoch": 1.2120098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1934147915780475, + "kl": 0.043784111738204956, + "learning_rate": 7.456690684162556e-07, + "loss": 0.0195, + "num_tokens": 31349329.0, + "reward": 0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.484890341758728, + "sampling/importance_sampling_ratio/mean": 1.0009729862213135, + "sampling/importance_sampling_ratio/min": 0.6217290163040161, + "sampling/sampling_logp_difference/max": 0.4752509593963623, + "sampling/sampling_logp_difference/mean": 0.018647316843271255, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 193.734375, + "completions/mean_terminated_length": 193.734375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.5381791591644287, + "epoch": 1.213235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1130555604937038, + "kl": 0.052825599908828735, + "learning_rate": 7.450483590719736e-07, + "loss": -0.0192, + "num_tokens": 31390544.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.4463319778442383, + "sampling/importance_sampling_ratio/mean": 1.0000205039978027, + "sampling/importance_sampling_ratio/min": 0.6269499063491821, + "sampling/sampling_logp_difference/max": 0.4668886661529541, + "sampling/sampling_logp_difference/mean": 0.017740219831466675, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 196.671875, + "completions/mean_terminated_length": 196.671875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.5464645624160767, + "epoch": 1.2144607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02109620294982689, + "kl": 0.04515843093395233, + "learning_rate": 7.444271522950468e-07, + "loss": 0.0005, + "num_tokens": 31419883.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6023401021957397, + "sampling/importance_sampling_ratio/mean": 1.0001468658447266, + "sampling/importance_sampling_ratio/min": 0.662236213684082, + "sampling/sampling_logp_difference/max": 0.4714651107788086, + "sampling/sampling_logp_difference/mean": 0.016527537256479263, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 186.25, + "completions/mean_terminated_length": 186.25, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.41974663734436035, + "epoch": 1.215686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023489431271263374, + "kl": 0.033342257142066956, + "learning_rate": 7.438054493464859e-07, + "loss": 0.0003, + "num_tokens": 31452619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.549433708190918, + "sampling/importance_sampling_ratio/mean": 1.000077724456787, + "sampling/importance_sampling_ratio/min": 0.7335385084152222, + "sampling/sampling_logp_difference/max": 0.43788957595825195, + "sampling/sampling_logp_difference/mean": 0.01427594292908907, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 199.6875, + "completions/mean_terminated_length": 199.6875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4310314655303955, + "epoch": 1.2169117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017950679150781197, + "kl": 0.037778109312057495, + "learning_rate": 7.431832514883081e-07, + "loss": 0.0004, + "num_tokens": 31481591.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3894187211990356, + "sampling/importance_sampling_ratio/mean": 1.0000156164169312, + "sampling/importance_sampling_ratio/min": 0.6943917870521545, + "sampling/sampling_logp_difference/max": 0.3647189140319824, + "sampling/sampling_logp_difference/mean": 0.014455149881541729, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 180.515625, + "completions/mean_terminated_length": 180.515625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4299527406692505, + "epoch": 1.218137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8697049910147577, + "kl": 0.03601614385843277, + "learning_rate": 7.42560559983536e-07, + "loss": 0.0272, + "num_tokens": 31511480.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6359535455703735, + "sampling/importance_sampling_ratio/mean": 1.0001451969146729, + "sampling/importance_sampling_ratio/min": 0.33314183354377747, + "sampling/sampling_logp_difference/max": 1.099186897277832, + "sampling/sampling_logp_difference/mean": 0.014509855769574642, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 215.390625, + "completions/mean_terminated_length": 215.390625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.5677188634872437, + "epoch": 1.219362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02080626355372966, + "kl": 0.03738052397966385, + "learning_rate": 7.419373760961939e-07, + "loss": 0.0004, + "num_tokens": 31545489.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.425569772720337, + "sampling/importance_sampling_ratio/mean": 0.9999176263809204, + "sampling/importance_sampling_ratio/min": 0.6939416527748108, + "sampling/sampling_logp_difference/max": 0.36536741256713867, + "sampling/sampling_logp_difference/mean": 0.017984483391046524, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 182.09375, + "completions/mean_terminated_length": 182.09375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.44296029210090637, + "epoch": 1.2205882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8011630456171402, + "kl": 0.04674481600522995, + "learning_rate": 7.413137010913054e-07, + "loss": 0.0142, + "num_tokens": 31572871.0, + "reward": -0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.8862242698669434, + "sampling/importance_sampling_ratio/mean": 1.0002238750457764, + "sampling/importance_sampling_ratio/min": 0.6639931797981262, + "sampling/sampling_logp_difference/max": 0.6345770359039307, + "sampling/sampling_logp_difference/mean": 0.015609879046678543, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 213.65625, + "completions/mean_terminated_length": 213.65625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.5575160384178162, + "epoch": 1.221813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9100488001028793, + "kl": 0.051423899829387665, + "learning_rate": 7.406895362348915e-07, + "loss": -0.0139, + "num_tokens": 31609169.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.3358298540115356, + "sampling/importance_sampling_ratio/mean": 0.9997066855430603, + "sampling/importance_sampling_ratio/min": 0.6651027202606201, + "sampling/sampling_logp_difference/max": 0.40781378746032715, + "sampling/sampling_logp_difference/mean": 0.016100607812404633, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 200.9375, + "completions/mean_terminated_length": 200.9375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4968104660511017, + "epoch": 1.2230392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9513883949425093, + "kl": 0.041767850518226624, + "learning_rate": 7.400648827939671e-07, + "loss": 0.0249, + "num_tokens": 31640381.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5180104970932007, + "sampling/importance_sampling_ratio/mean": 0.9998517036437988, + "sampling/importance_sampling_ratio/min": 0.6954967379570007, + "sampling/sampling_logp_difference/max": 0.417400598526001, + "sampling/sampling_logp_difference/mean": 0.01566733419895172, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 209.125, + "completions/mean_terminated_length": 209.125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.5883033871650696, + "epoch": 1.224264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7851977292537808, + "kl": 0.04143001139163971, + "learning_rate": 7.394397420365392e-07, + "loss": -0.0112, + "num_tokens": 31672597.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.619869589805603, + "sampling/importance_sampling_ratio/mean": 1.0006663799285889, + "sampling/importance_sampling_ratio/min": 0.7099029421806335, + "sampling/sampling_logp_difference/max": 0.4823455810546875, + "sampling/sampling_logp_difference/mean": 0.017243143171072006, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 202.171875, + "completions/mean_terminated_length": 202.171875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3977816104888916, + "epoch": 1.2254901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8975597494126939, + "kl": 0.03055647574365139, + "learning_rate": 7.388141152316038e-07, + "loss": -0.0348, + "num_tokens": 31701616.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4754674434661865, + "sampling/importance_sampling_ratio/mean": 1.0006155967712402, + "sampling/importance_sampling_ratio/min": 0.6483342051506042, + "sampling/sampling_logp_difference/max": 0.4333488941192627, + "sampling/sampling_logp_difference/mean": 0.013678541406989098, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 166.359375, + "completions/mean_terminated_length": 166.359375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.39177465438842773, + "epoch": 1.2267156862745099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022641037487017263, + "kl": 0.039845582097768784, + "learning_rate": 7.381880036491439e-07, + "loss": 0.0004, + "num_tokens": 31725047.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6429468393325806, + "sampling/importance_sampling_ratio/mean": 1.0002089738845825, + "sampling/importance_sampling_ratio/min": 0.7211967706680298, + "sampling/sampling_logp_difference/max": 0.4964914321899414, + "sampling/sampling_logp_difference/mean": 0.015742601826786995, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 229.71875, + "completions/mean_terminated_length": 229.71875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4799444377422333, + "epoch": 1.2279411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021503236883838844, + "kl": 0.03737611323595047, + "learning_rate": 7.375614085601264e-07, + "loss": 0.0004, + "num_tokens": 31760325.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.653969407081604, + "sampling/importance_sampling_ratio/mean": 0.9996362924575806, + "sampling/importance_sampling_ratio/min": 0.6833867430686951, + "sampling/sampling_logp_difference/max": 0.5031781196594238, + "sampling/sampling_logp_difference/mean": 0.016141796484589577, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 185.0, + "completions/mean_terminated_length": 185.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.464060515165329, + "epoch": 1.2291666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.84347200364607, + "kl": 0.06338971108198166, + "learning_rate": 7.369343312364993e-07, + "loss": 0.0107, + "num_tokens": 31787125.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.615753173828125, + "sampling/importance_sampling_ratio/mean": 1.0008978843688965, + "sampling/importance_sampling_ratio/min": 0.6864622235298157, + "sampling/sampling_logp_difference/max": 0.4798011779785156, + "sampling/sampling_logp_difference/mean": 0.015821874141693115, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 248.40625, + "completions/mean_terminated_length": 248.40625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.48409995436668396, + "epoch": 1.2303921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7596962203726745, + "kl": 0.03787213936448097, + "learning_rate": 7.363067729511901e-07, + "loss": -0.0062, + "num_tokens": 31823455.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4660547971725464, + "sampling/importance_sampling_ratio/mean": 0.9998261332511902, + "sampling/importance_sampling_ratio/min": 0.6262744665145874, + "sampling/sampling_logp_difference/max": 0.46796655654907227, + "sampling/sampling_logp_difference/mean": 0.014787048101425171, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 239.28125, + "completions/mean_terminated_length": 239.28125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4689325988292694, + "epoch": 1.2316176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01842944255374365, + "kl": 0.031861260533332825, + "learning_rate": 7.356787349781022e-07, + "loss": 0.0003, + "num_tokens": 31859329.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.603270173072815, + "sampling/importance_sampling_ratio/mean": 1.00017249584198, + "sampling/importance_sampling_ratio/min": 0.5374639630317688, + "sampling/sampling_logp_difference/max": 0.6208934783935547, + "sampling/sampling_logp_difference/mean": 0.016252703964710236, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 252.609375, + "completions/mean_terminated_length": 252.609375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.5425338745117188, + "epoch": 1.232843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7354271231791148, + "kl": 0.04957719147205353, + "learning_rate": 7.350502185921131e-07, + "loss": -0.0097, + "num_tokens": 31893688.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4298754930496216, + "sampling/importance_sampling_ratio/mean": 1.000070333480835, + "sampling/importance_sampling_ratio/min": 0.6960698962211609, + "sampling/sampling_logp_difference/max": 0.3623051643371582, + "sampling/sampling_logp_difference/mean": 0.016065813601017, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 218.328125, + "completions/mean_terminated_length": 218.328125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.38892823457717896, + "epoch": 1.2340686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018745895758316127, + "kl": 0.029336608946323395, + "learning_rate": 7.344212250690711e-07, + "loss": 0.0003, + "num_tokens": 31921821.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.390149474143982, + "sampling/importance_sampling_ratio/mean": 0.9999837875366211, + "sampling/importance_sampling_ratio/min": 0.709245502948761, + "sampling/sampling_logp_difference/max": 0.3435535430908203, + "sampling/sampling_logp_difference/mean": 0.013726888224482536, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 227.421875, + "completions/mean_terminated_length": 227.421875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4669730067253113, + "epoch": 1.2352941176470589, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0752558590695398, + "kl": 0.04439125582575798, + "learning_rate": 7.337917556857934e-07, + "loss": 0.023, + "num_tokens": 31957064.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.552883505821228, + "sampling/importance_sampling_ratio/mean": 0.999682605266571, + "sampling/importance_sampling_ratio/min": 0.6176310777664185, + "sampling/sampling_logp_difference/max": 0.48186397552490234, + "sampling/sampling_logp_difference/mean": 0.015025627799332142, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 224.5, + "completions/mean_terminated_length": 224.5, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.5444422364234924, + "epoch": 1.2365196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7670486189730752, + "kl": 0.06080947816371918, + "learning_rate": 7.331618117200625e-07, + "loss": -0.0069, + "num_tokens": 31991240.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4498920440673828, + "sampling/importance_sampling_ratio/mean": 0.9998273849487305, + "sampling/importance_sampling_ratio/min": 0.6877607703208923, + "sampling/sampling_logp_difference/max": 0.37431418895721436, + "sampling/sampling_logp_difference/mean": 0.017048103734850883, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 232.96875, + "completions/mean_terminated_length": 232.96875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.33657899498939514, + "epoch": 1.2377450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019503945983382134, + "kl": 0.030245546251535416, + "learning_rate": 7.325313944506253e-07, + "loss": 0.0003, + "num_tokens": 32025974.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.439969539642334, + "sampling/importance_sampling_ratio/mean": 1.0001139640808105, + "sampling/importance_sampling_ratio/min": 0.6171409487724304, + "sampling/sampling_logp_difference/max": 0.48265790939331055, + "sampling/sampling_logp_difference/mean": 0.011354037560522556, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 213.109375, + "completions/mean_terminated_length": 213.109375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3115587830543518, + "epoch": 1.2389705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01905305858685132, + "kl": 0.02667442336678505, + "learning_rate": 7.319005051571885e-07, + "loss": 0.0002, + "num_tokens": 32054765.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.518096923828125, + "sampling/importance_sampling_ratio/mean": 0.9998236894607544, + "sampling/importance_sampling_ratio/min": 0.7007561326026917, + "sampling/sampling_logp_difference/max": 0.41745758056640625, + "sampling/sampling_logp_difference/mean": 0.012191656976938248, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 259.234375, + "completions/mean_terminated_length": 259.234375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4942418336868286, + "epoch": 1.2401960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022053430549242234, + "kl": 0.03949630260467529, + "learning_rate": 7.312691451204177e-07, + "loss": 0.0004, + "num_tokens": 32091644.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4972546100616455, + "sampling/importance_sampling_ratio/mean": 1.0000078678131104, + "sampling/importance_sampling_ratio/min": 0.6231526136398315, + "sampling/sampling_logp_difference/max": 0.472963809967041, + "sampling/sampling_logp_difference/mean": 0.015944818034768105, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 215.09375, + "completions/mean_terminated_length": 215.09375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.35535722970962524, + "epoch": 1.241421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7438780241335894, + "kl": 0.04291858524084091, + "learning_rate": 7.306373156219335e-07, + "loss": 0.0085, + "num_tokens": 32119954.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.298388123512268, + "sampling/importance_sampling_ratio/mean": 0.9999449253082275, + "sampling/importance_sampling_ratio/min": 0.6209536790847778, + "sampling/sampling_logp_difference/max": 0.4764988422393799, + "sampling/sampling_logp_difference/mean": 0.012562550604343414, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 294.015625, + "completions/mean_terminated_length": 294.015625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.5063092112541199, + "epoch": 1.2426470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020977379822758387, + "kl": 0.04146093875169754, + "learning_rate": 7.300050179443099e-07, + "loss": 0.0004, + "num_tokens": 32158579.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3271468877792358, + "sampling/importance_sampling_ratio/mean": 1.000190258026123, + "sampling/importance_sampling_ratio/min": 0.6283326745033264, + "sampling/sampling_logp_difference/max": 0.4646855592727661, + "sampling/sampling_logp_difference/mean": 0.015025531873106956, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 291.453125, + "completions/mean_terminated_length": 291.453125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.5707547664642334, + "epoch": 1.2438725490196079, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.2276676590333753, + "kl": 0.04638542979955673, + "learning_rate": 7.293722533710714e-07, + "loss": 0.0127, + "num_tokens": 32211472.0, + "reward": 0.5, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7457033395767212, + "sampling/importance_sampling_ratio/mean": 0.9994102716445923, + "sampling/importance_sampling_ratio/min": 0.6368858218193054, + "sampling/sampling_logp_difference/max": 0.5571575164794922, + "sampling/sampling_logp_difference/mean": 0.01785561442375183, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 252.140625, + "completions/mean_terminated_length": 252.140625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.43048757314682007, + "epoch": 1.2450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025821215352880268, + "kl": 0.0424998477101326, + "learning_rate": 7.287390231866893e-07, + "loss": 0.0004, + "num_tokens": 32243289.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6037300825119019, + "sampling/importance_sampling_ratio/mean": 1.0000109672546387, + "sampling/importance_sampling_ratio/min": 0.6250476837158203, + "sampling/sampling_logp_difference/max": 0.472332239151001, + "sampling/sampling_logp_difference/mean": 0.0146937882527709, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 253.90625, + "completions/mean_terminated_length": 253.90625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.42356592416763306, + "epoch": 1.2463235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6471867008574946, + "kl": 0.05514194071292877, + "learning_rate": 7.281053286765815e-07, + "loss": 0.0035, + "num_tokens": 32277747.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.464868426322937, + "sampling/importance_sampling_ratio/mean": 1.0003128051757812, + "sampling/importance_sampling_ratio/min": 0.703257143497467, + "sampling/sampling_logp_difference/max": 0.38176536560058594, + "sampling/sampling_logp_difference/mean": 0.01333966851234436, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 290.984375, + "completions/mean_terminated_length": 290.984375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.49442100524902344, + "epoch": 1.2475490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6719842922456302, + "kl": 0.03633398562669754, + "learning_rate": 7.274711711271073e-07, + "loss": 0.0373, + "num_tokens": 32313634.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.445326566696167, + "sampling/importance_sampling_ratio/mean": 0.999626100063324, + "sampling/importance_sampling_ratio/min": 0.578313946723938, + "sampling/sampling_logp_difference/max": 0.5476384162902832, + "sampling/sampling_logp_difference/mean": 0.01476279366761446, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 216.84375, + "completions/mean_terminated_length": 216.84375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.43948325514793396, + "epoch": 1.2487745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02685388704425198, + "kl": 0.04969961941242218, + "learning_rate": 7.268365518255665e-07, + "loss": 0.0005, + "num_tokens": 32343144.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4139961004257202, + "sampling/importance_sampling_ratio/mean": 1.0000815391540527, + "sampling/importance_sampling_ratio/min": 0.6436682939529419, + "sampling/sampling_logp_difference/max": 0.44057178497314453, + "sampling/sampling_logp_difference/mean": 0.015196645632386208, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 302.8125, + "completions/mean_terminated_length": 302.8125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.5333965420722961, + "epoch": 1.25, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5883894213341854, + "kl": 0.06262734532356262, + "learning_rate": 7.262014720601958e-07, + "loss": 0.0001, + "num_tokens": 32390236.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.4623440504074097, + "sampling/importance_sampling_ratio/mean": 1.0003504753112793, + "sampling/importance_sampling_ratio/min": 0.6772839426994324, + "sampling/sampling_logp_difference/max": 0.3896646499633789, + "sampling/sampling_logp_difference/mean": 0.015126075595617294, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 307.109375, + "completions/mean_terminated_length": 307.109375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.4063161611557007, + "epoch": 1.2512254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021609550166476525, + "kl": 0.03785210847854614, + "learning_rate": 7.255659331201673e-07, + "loss": 0.0004, + "num_tokens": 32431299.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001342296600342, + "sampling/importance_sampling_ratio/min": 0.6368669271469116, + "sampling/sampling_logp_difference/max": 0.7420744895935059, + "sampling/sampling_logp_difference/mean": 0.012437833473086357, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 265.765625, + "completions/mean_terminated_length": 265.765625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.5111925601959229, + "epoch": 1.2524509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6003328720428629, + "kl": 0.06019875407218933, + "learning_rate": 7.249299362955845e-07, + "loss": 0.0213, + "num_tokens": 32469892.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.318848729133606, + "sampling/importance_sampling_ratio/mean": 1.0000724792480469, + "sampling/importance_sampling_ratio/min": 0.679656982421875, + "sampling/sampling_logp_difference/max": 0.386167049407959, + "sampling/sampling_logp_difference/mean": 0.01651834324002266, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 253.671875, + "completions/mean_terminated_length": 253.671875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.5959911346435547, + "epoch": 1.2536764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9339709076519227, + "kl": 0.08829143643379211, + "learning_rate": 7.242934828774808e-07, + "loss": -0.0129, + "num_tokens": 32509711.0, + "reward": 0.125, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6276863813400269, + "sampling/importance_sampling_ratio/mean": 1.0000534057617188, + "sampling/importance_sampling_ratio/min": 0.6988682150840759, + "sampling/sampling_logp_difference/max": 0.4871596097946167, + "sampling/sampling_logp_difference/mean": 0.018192298710346222, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 330.625, + "completions/mean_terminated_length": 330.625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.5745341777801514, + "epoch": 1.2549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7087028040316709, + "kl": 0.053980231285095215, + "learning_rate": 7.236565741578162e-07, + "loss": 0.0365, + "num_tokens": 32551031.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996329545974731, + "sampling/importance_sampling_ratio/min": 0.6725244522094727, + "sampling/sampling_logp_difference/max": 0.850208044052124, + "sampling/sampling_logp_difference/mean": 0.016945503652095795, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 269.203125, + "completions/mean_terminated_length": 269.203125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.39986565709114075, + "epoch": 1.2561274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025664668018526356, + "kl": 0.040328651666641235, + "learning_rate": 7.230192114294753e-07, + "loss": 0.0004, + "num_tokens": 32585988.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5961456298828125, + "sampling/importance_sampling_ratio/mean": 0.9999529719352722, + "sampling/importance_sampling_ratio/min": 0.6376746892929077, + "sampling/sampling_logp_difference/max": 0.4675917625427246, + "sampling/sampling_logp_difference/mean": 0.013972658663988113, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 359.25, + "completions/mean_terminated_length": 359.25, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.3309788107872009, + "epoch": 1.2573529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01775832964892254, + "kl": 0.024740155786275864, + "learning_rate": 7.223813959862638e-07, + "loss": 0.0002, + "num_tokens": 32624676.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4058010578155518, + "sampling/importance_sampling_ratio/mean": 1.0001771450042725, + "sampling/importance_sampling_ratio/min": 0.6491348147392273, + "sampling/sampling_logp_difference/max": 0.432114839553833, + "sampling/sampling_logp_difference/mean": 0.01120903342962265, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 253.078125, + "completions/mean_terminated_length": 253.078125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.41706496477127075, + "epoch": 1.258578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026393378659487716, + "kl": 0.04604463651776314, + "learning_rate": 7.217431291229067e-07, + "loss": 0.0004, + "num_tokens": 32660809.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5524452924728394, + "sampling/importance_sampling_ratio/mean": 1.0001442432403564, + "sampling/importance_sampling_ratio/min": 0.6628890633583069, + "sampling/sampling_logp_difference/max": 0.4398312568664551, + "sampling/sampling_logp_difference/mean": 0.013960368931293488, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 208.046875, + "completions/mean_terminated_length": 208.046875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.36943647265434265, + "epoch": 1.2598039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03732666976560762, + "kl": 0.04475586861371994, + "learning_rate": 7.211044121350454e-07, + "loss": 0.0004, + "num_tokens": 32690396.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2994472980499268, + "sampling/importance_sampling_ratio/mean": 0.9995710849761963, + "sampling/importance_sampling_ratio/min": 0.6124758124351501, + "sampling/sampling_logp_difference/max": 0.4902458190917969, + "sampling/sampling_logp_difference/mean": 0.013447067700326443, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 247.09375, + "completions/mean_terminated_length": 247.09375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.42932724952697754, + "epoch": 1.2610294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02541340858929853, + "kl": 0.03734192997217178, + "learning_rate": 7.204652463192347e-07, + "loss": 0.0004, + "num_tokens": 32726786.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3745006322860718, + "sampling/importance_sampling_ratio/mean": 0.9998223781585693, + "sampling/importance_sampling_ratio/min": 0.6333354115486145, + "sampling/sampling_logp_difference/max": 0.4567551612854004, + "sampling/sampling_logp_difference/mean": 0.014694184064865112, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 300.59375, + "completions/mean_terminated_length": 300.59375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.46342897415161133, + "epoch": 1.2622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02649329390311697, + "kl": 0.052085746079683304, + "learning_rate": 7.198256329729411e-07, + "loss": 0.0005, + "num_tokens": 32768632.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.527761459350586, + "sampling/importance_sampling_ratio/mean": 1.000121831893921, + "sampling/importance_sampling_ratio/min": 0.5563894510269165, + "sampling/sampling_logp_difference/max": 0.5862867832183838, + "sampling/sampling_logp_difference/mean": 0.014383465051651001, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.0, + "completions/max_terminated_length": 596.0, + "completions/mean_length": 225.984375, + "completions/mean_terminated_length": 225.984375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3839717507362366, + "epoch": 1.2634803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7988206397905827, + "kl": 0.043448030948638916, + "learning_rate": 7.191855733945386e-07, + "loss": -0.0103, + "num_tokens": 32808407.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9693922996520996, + "sampling/importance_sampling_ratio/mean": 1.0001676082611084, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.677725076675415, + "sampling/sampling_logp_difference/mean": 0.013649694621562958, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 218.484375, + "completions/mean_terminated_length": 218.484375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4352691173553467, + "epoch": 1.2647058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0543315479902784, + "kl": 0.0602213591337204, + "learning_rate": 7.185450688833083e-07, + "loss": 0.041, + "num_tokens": 32839110.0, + "reward": 0.75, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3956884145736694, + "sampling/importance_sampling_ratio/mean": 0.9996954202651978, + "sampling/importance_sampling_ratio/min": 0.6771960854530334, + "sampling/sampling_logp_difference/max": 0.38979434967041016, + "sampling/sampling_logp_difference/mean": 0.015575871802866459, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 266.15625, + "completions/mean_terminated_length": 266.15625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.4167899787425995, + "epoch": 1.2659313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025274840811054574, + "kl": 0.03985028713941574, + "learning_rate": 7.179041207394331e-07, + "loss": 0.0004, + "num_tokens": 32873392.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5278154611587524, + "sampling/importance_sampling_ratio/mean": 0.9999694228172302, + "sampling/importance_sampling_ratio/min": 0.6771520972251892, + "sampling/sampling_logp_difference/max": 0.42383885383605957, + "sampling/sampling_logp_difference/mean": 0.014345312491059303, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 240.25, + "completions/mean_terminated_length": 240.25, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4263748228549957, + "epoch": 1.267156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023683970665192327, + "kl": 0.03402005881071091, + "learning_rate": 7.172627302639975e-07, + "loss": 0.0003, + "num_tokens": 32912480.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5070692300796509, + "sampling/importance_sampling_ratio/mean": 0.9993304014205933, + "sampling/importance_sampling_ratio/min": 0.5725120902061462, + "sampling/sampling_logp_difference/max": 0.5577214956283569, + "sampling/sampling_logp_difference/mean": 0.014584648422896862, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 283.515625, + "completions/mean_terminated_length": 283.515625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.4535956084728241, + "epoch": 1.2683823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031473450565118265, + "kl": 0.03707154840230942, + "learning_rate": 7.166208987589836e-07, + "loss": 0.0004, + "num_tokens": 32946033.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4811121225357056, + "sampling/importance_sampling_ratio/mean": 1.0002422332763672, + "sampling/importance_sampling_ratio/min": 0.5675489902496338, + "sampling/sampling_logp_difference/max": 0.5664281845092773, + "sampling/sampling_logp_difference/mean": 0.01619875617325306, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 241.765625, + "completions/mean_terminated_length": 241.765625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.2822660803794861, + "epoch": 1.2696078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022556727140358727, + "kl": 0.02592044323682785, + "learning_rate": 7.159786275272686e-07, + "loss": 0.0003, + "num_tokens": 32976562.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.597345232963562, + "sampling/importance_sampling_ratio/mean": 1.0000855922698975, + "sampling/importance_sampling_ratio/min": 0.6534473299980164, + "sampling/sampling_logp_difference/max": 0.46834301948547363, + "sampling/sampling_logp_difference/mean": 0.012253960594534874, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 212.359375, + "completions/mean_terminated_length": 212.359375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.37916967272758484, + "epoch": 1.2708333333333333, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7746878225115345, + "kl": 0.04545515775680542, + "learning_rate": 7.153359178726221e-07, + "loss": 0.0251, + "num_tokens": 33004969.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5469253063201904, + "sampling/importance_sampling_ratio/mean": 0.9998739957809448, + "sampling/importance_sampling_ratio/min": 0.6379643082618713, + "sampling/sampling_logp_difference/max": 0.44947290420532227, + "sampling/sampling_logp_difference/mean": 0.015152151696383953, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 224.0625, + "completions/mean_terminated_length": 224.0625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.453823447227478, + "epoch": 1.2720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027001662920450437, + "kl": 0.039575982838869095, + "learning_rate": 7.146927710997046e-07, + "loss": 0.0004, + "num_tokens": 33036461.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3945215940475464, + "sampling/importance_sampling_ratio/mean": 0.9998276233673096, + "sampling/importance_sampling_ratio/min": 0.6896499991416931, + "sampling/sampling_logp_difference/max": 0.37157106399536133, + "sampling/sampling_logp_difference/mean": 0.01585349440574646, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 253.953125, + "completions/mean_terminated_length": 253.953125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.32050231099128723, + "epoch": 1.2732843137254901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02010016021050216, + "kl": 0.029850102961063385, + "learning_rate": 7.140491885140628e-07, + "loss": 0.0003, + "num_tokens": 33068666.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5174421072006226, + "sampling/importance_sampling_ratio/mean": 1.000077247619629, + "sampling/importance_sampling_ratio/min": 0.6370596289634705, + "sampling/sampling_logp_difference/max": 0.45089197158813477, + "sampling/sampling_logp_difference/mean": 0.013122981414198875, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 246.8125, + "completions/mean_terminated_length": 246.8125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.425956130027771, + "epoch": 1.2745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022397279953500417, + "kl": 0.03901214897632599, + "learning_rate": 7.134051714221286e-07, + "loss": 0.0003, + "num_tokens": 33104446.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.581176996231079, + "sampling/importance_sampling_ratio/mean": 1.0002460479736328, + "sampling/importance_sampling_ratio/min": 0.616258978843689, + "sampling/sampling_logp_difference/max": 0.4840879440307617, + "sampling/sampling_logp_difference/mean": 0.015681080520153046, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 188.234375, + "completions/mean_terminated_length": 188.234375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.2762240469455719, + "epoch": 1.2757352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024447707372076348, + "kl": 0.02437257394194603, + "learning_rate": 7.127607211312162e-07, + "loss": 0.0002, + "num_tokens": 33130941.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4655163288116455, + "sampling/importance_sampling_ratio/mean": 1.0001275539398193, + "sampling/importance_sampling_ratio/min": 0.6214760541915894, + "sampling/sampling_logp_difference/max": 0.47565793991088867, + "sampling/sampling_logp_difference/mean": 0.012499706819653511, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 230.859375, + "completions/mean_terminated_length": 230.859375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.302249550819397, + "epoch": 1.2769607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02441438786301167, + "kl": 0.026303227990865707, + "learning_rate": 7.121158389495185e-07, + "loss": 0.0002, + "num_tokens": 33161908.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4328745603561401, + "sampling/importance_sampling_ratio/mean": 0.9998009204864502, + "sampling/importance_sampling_ratio/min": 0.6068763732910156, + "sampling/sampling_logp_difference/max": 0.49943017959594727, + "sampling/sampling_logp_difference/mean": 0.01246584951877594, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 252.6875, + "completions/mean_terminated_length": 252.6875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4117159843444824, + "epoch": 1.278186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9586207423221864, + "kl": 0.04804814234375954, + "learning_rate": 7.114705261861061e-07, + "loss": 0.0135, + "num_tokens": 33202896.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.8996422290802002, + "sampling/importance_sampling_ratio/mean": 0.9999984502792358, + "sampling/importance_sampling_ratio/min": 0.5023150444030762, + "sampling/sampling_logp_difference/max": 0.6885278224945068, + "sampling/sampling_logp_difference/mean": 0.016012828797101974, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 243.59375, + "completions/mean_terminated_length": 243.59375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2845577597618103, + "epoch": 1.2794117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8391865100951481, + "kl": 0.03303593397140503, + "learning_rate": 7.108247841509222e-07, + "loss": -0.034, + "num_tokens": 33231094.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.627860426902771, + "sampling/importance_sampling_ratio/mean": 0.9997678995132446, + "sampling/importance_sampling_ratio/min": 0.644490122795105, + "sampling/sampling_logp_difference/max": 0.48726654052734375, + "sampling/sampling_logp_difference/mean": 0.011671999469399452, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 182.859375, + "completions/mean_terminated_length": 182.859375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3172720968723297, + "epoch": 1.280637254901961, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5413983239586968, + "kl": 0.05567832291126251, + "learning_rate": 7.101786141547828e-07, + "loss": 0.0256, + "num_tokens": 33258365.0, + "reward": 0.0, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6092875003814697, + "sampling/importance_sampling_ratio/mean": 1.000196933746338, + "sampling/importance_sampling_ratio/min": 0.6497927308082581, + "sampling/sampling_logp_difference/max": 0.47579145431518555, + "sampling/sampling_logp_difference/mean": 0.013540423475205898, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 218.59375, + "completions/mean_terminated_length": 218.59375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.32802873849868774, + "epoch": 1.281862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025774050768263553, + "kl": 0.024461662396788597, + "learning_rate": 7.095320175093718e-07, + "loss": 0.0002, + "num_tokens": 33287971.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.550926923751831, + "sampling/importance_sampling_ratio/mean": 0.9997495412826538, + "sampling/importance_sampling_ratio/min": 0.6130277514457703, + "sampling/sampling_logp_difference/max": 0.48934507369995117, + "sampling/sampling_logp_difference/mean": 0.013812687247991562, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 158.9375, + "completions/mean_terminated_length": 158.9375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.24623757600784302, + "epoch": 1.2830882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031035160236915355, + "kl": 0.029723387211561203, + "learning_rate": 7.088849955272396e-07, + "loss": 0.0003, + "num_tokens": 33313935.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3437520265579224, + "sampling/importance_sampling_ratio/mean": 0.9991786479949951, + "sampling/importance_sampling_ratio/min": 0.6279453039169312, + "sampling/sampling_logp_difference/max": 0.4653022289276123, + "sampling/sampling_logp_difference/mean": 0.011917706578969955, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 157.765625, + "completions/mean_terminated_length": 157.765625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3262321949005127, + "epoch": 1.284313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03942257101959648, + "kl": 0.041421011090278625, + "learning_rate": 7.082375495217995e-07, + "loss": 0.0004, + "num_tokens": 33339824.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4285696744918823, + "sampling/importance_sampling_ratio/mean": 1.0001627206802368, + "sampling/importance_sampling_ratio/min": 0.6262629628181458, + "sampling/sampling_logp_difference/max": 0.4679849147796631, + "sampling/sampling_logp_difference/mean": 0.015121504664421082, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 196.671875, + "completions/mean_terminated_length": 196.671875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.32499948143959045, + "epoch": 1.2855392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03905924470577792, + "kl": 0.031079092994332314, + "learning_rate": 7.075896808073263e-07, + "loss": 0.0003, + "num_tokens": 33370571.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5940988063812256, + "sampling/importance_sampling_ratio/mean": 1.000136375427246, + "sampling/importance_sampling_ratio/min": 0.6450546383857727, + "sampling/sampling_logp_difference/max": 0.46630859375, + "sampling/sampling_logp_difference/mean": 0.014183073304593563, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 189.15625, + "completions/mean_terminated_length": 189.15625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.33075258135795593, + "epoch": 1.2867647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8668856395806319, + "kl": 0.052480727434158325, + "learning_rate": 7.069413906989523e-07, + "loss": 0.013, + "num_tokens": 33400869.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5986402034759521, + "sampling/importance_sampling_ratio/mean": 0.9995260834693909, + "sampling/importance_sampling_ratio/min": 0.6558920741081238, + "sampling/sampling_logp_difference/max": 0.46915340423583984, + "sampling/sampling_logp_difference/mean": 0.013863028958439827, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 188.609375, + "completions/mean_terminated_length": 188.609375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3841259479522705, + "epoch": 1.2879901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9909333990221172, + "kl": 0.05324501544237137, + "learning_rate": 7.062926805126652e-07, + "loss": 0.0465, + "num_tokens": 33431148.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5077095031738281, + "sampling/importance_sampling_ratio/mean": 1.0002236366271973, + "sampling/importance_sampling_ratio/min": 0.6441048383712769, + "sampling/sampling_logp_difference/max": 0.4398937225341797, + "sampling/sampling_logp_difference/mean": 0.016419682651758194, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 180.4375, + "completions/mean_terminated_length": 180.4375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.390139639377594, + "epoch": 1.2892156862745099, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0680609818372886, + "kl": 0.04708288609981537, + "learning_rate": 7.056435515653058e-07, + "loss": -0.048, + "num_tokens": 33459352.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4656306505203247, + "sampling/importance_sampling_ratio/mean": 1.0000035762786865, + "sampling/importance_sampling_ratio/min": 0.5664933919906616, + "sampling/sampling_logp_difference/max": 0.5682897567749023, + "sampling/sampling_logp_difference/mean": 0.01614879071712494, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 179.28125, + "completions/mean_terminated_length": 179.28125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.38631048798561096, + "epoch": 1.2904411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4400635236264094, + "kl": 0.06664882600307465, + "learning_rate": 7.049940051745646e-07, + "loss": 0.0118, + "num_tokens": 33486490.0, + "reward": 0.21875, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4916131496429443, + "sampling/importance_sampling_ratio/mean": 0.9997199177742004, + "sampling/importance_sampling_ratio/min": 0.6213639974594116, + "sampling/sampling_logp_difference/max": 0.47583818435668945, + "sampling/sampling_logp_difference/mean": 0.01591946929693222, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 188.8125, + "completions/mean_terminated_length": 188.8125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3945600986480713, + "epoch": 1.2916666666666667, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2811574243731971, + "kl": 0.060895487666130066, + "learning_rate": 7.043440426589795e-07, + "loss": -0.0234, + "num_tokens": 33520894.0, + "reward": -0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5335569381713867, + "sampling/importance_sampling_ratio/mean": 1.0000795125961304, + "sampling/importance_sampling_ratio/min": 0.6764960885047913, + "sampling/sampling_logp_difference/max": 0.42758989334106445, + "sampling/sampling_logp_difference/mean": 0.015397395007312298, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 169.578125, + "completions/mean_terminated_length": 169.578125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.2863377630710602, + "epoch": 1.2928921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9757538928484449, + "kl": 0.05878036841750145, + "learning_rate": 7.036936653379335e-07, + "loss": 0.0069, + "num_tokens": 33550339.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.422853708267212, + "sampling/importance_sampling_ratio/mean": 0.9996306300163269, + "sampling/importance_sampling_ratio/min": 0.6298396587371826, + "sampling/sampling_logp_difference/max": 0.46229004859924316, + "sampling/sampling_logp_difference/mean": 0.013411764986813068, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 195.890625, + "completions/mean_terminated_length": 195.890625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3994911313056946, + "epoch": 1.2941176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8204277477205849, + "kl": 0.06238219887018204, + "learning_rate": 7.030428745316512e-07, + "loss": 0.015, + "num_tokens": 33585308.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5112587213516235, + "sampling/importance_sampling_ratio/mean": 1.0002164840698242, + "sampling/importance_sampling_ratio/min": 0.6720823645591736, + "sampling/sampling_logp_difference/max": 0.41294288635253906, + "sampling/sampling_logp_difference/mean": 0.015088235959410667, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.35088345408439636, + "epoch": 1.295343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8394822331891233, + "kl": 0.05312206968665123, + "learning_rate": 7.023916715611968e-07, + "loss": 0.046, + "num_tokens": 33618166.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.642237901687622, + "sampling/importance_sampling_ratio/mean": 1.000025987625122, + "sampling/importance_sampling_ratio/min": 0.5948571562767029, + "sampling/sampling_logp_difference/max": 0.5194339752197266, + "sampling/sampling_logp_difference/mean": 0.014569773338735104, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 198.328125, + "completions/mean_terminated_length": 198.328125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.43649303913116455, + "epoch": 1.2965686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06133202697411687, + "kl": 0.06992170214653015, + "learning_rate": 7.017400577484712e-07, + "loss": 0.0007, + "num_tokens": 33647435.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4692312479019165, + "sampling/importance_sampling_ratio/mean": 0.9996936321258545, + "sampling/importance_sampling_ratio/min": 0.26469555497169495, + "sampling/sampling_logp_difference/max": 1.3291749954223633, + "sampling/sampling_logp_difference/mean": 0.0163760744035244, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 167.84375, + "completions/mean_terminated_length": 167.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3744012117385864, + "epoch": 1.2977941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7941412030978372, + "kl": 0.04241356998682022, + "learning_rate": 7.010880344162086e-07, + "loss": 0.019, + "num_tokens": 33677073.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5307191610336304, + "sampling/importance_sampling_ratio/mean": 0.9998902678489685, + "sampling/importance_sampling_ratio/min": 0.6202341318130493, + "sampling/sampling_logp_difference/max": 0.4776582717895508, + "sampling/sampling_logp_difference/mean": 0.01614062674343586, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 192.03125, + "completions/mean_terminated_length": 192.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.36728131771087646, + "epoch": 1.2990196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9322385344088944, + "kl": 0.03659145534038544, + "learning_rate": 7.004356028879758e-07, + "loss": -0.0027, + "num_tokens": 33709619.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4058219194412231, + "sampling/importance_sampling_ratio/mean": 0.9997914433479309, + "sampling/importance_sampling_ratio/min": 0.5355716347694397, + "sampling/sampling_logp_difference/max": 0.6244206428527832, + "sampling/sampling_logp_difference/mean": 0.01563429832458496, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 151.640625, + "completions/mean_terminated_length": 151.640625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3456276059150696, + "epoch": 1.3002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1141161126383432, + "kl": 0.05363607406616211, + "learning_rate": 6.99782764488167e-07, + "loss": 0.0238, + "num_tokens": 33738236.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5828100442886353, + "sampling/importance_sampling_ratio/mean": 0.9999970197677612, + "sampling/importance_sampling_ratio/min": 0.6303369998931885, + "sampling/sampling_logp_difference/max": 0.4615006446838379, + "sampling/sampling_logp_difference/mean": 0.015090061351656914, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 171.609375, + "completions/mean_terminated_length": 171.609375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.27703601121902466, + "epoch": 1.3014705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026881586740977955, + "kl": 0.029022444039583206, + "learning_rate": 6.991295205420027e-07, + "loss": 0.0003, + "num_tokens": 33766995.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4080665111541748, + "sampling/importance_sampling_ratio/mean": 1.0002038478851318, + "sampling/importance_sampling_ratio/min": 0.583820641040802, + "sampling/sampling_logp_difference/max": 0.5381613969802856, + "sampling/sampling_logp_difference/mean": 0.011404757387936115, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 179.984375, + "completions/mean_terminated_length": 179.984375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4201258420944214, + "epoch": 1.3026960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.052123880280025646, + "kl": 0.0661260113120079, + "learning_rate": 6.984758723755272e-07, + "loss": 0.0006, + "num_tokens": 33797682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6265360116958618, + "sampling/importance_sampling_ratio/mean": 1.0000171661376953, + "sampling/importance_sampling_ratio/min": 0.7134343981742859, + "sampling/sampling_logp_difference/max": 0.486452579498291, + "sampling/sampling_logp_difference/mean": 0.016265347599983215, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 184.578125, + "completions/mean_terminated_length": 184.578125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4626278877258301, + "epoch": 1.303921568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.41291532247678, + "kl": 0.07594802230596542, + "learning_rate": 6.978218213156044e-07, + "loss": -0.034, + "num_tokens": 33824631.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.7275714874267578, + "sampling/importance_sampling_ratio/mean": 1.000497817993164, + "sampling/importance_sampling_ratio/min": 0.6479448676109314, + "sampling/sampling_logp_difference/max": 0.5467166900634766, + "sampling/sampling_logp_difference/mean": 0.017567459493875504, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 181.0, + "completions/mean_terminated_length": 181.0, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.42885833978652954, + "epoch": 1.3051470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.268876871042677, + "kl": 0.06140081584453583, + "learning_rate": 6.971673686899169e-07, + "loss": 0.0301, + "num_tokens": 33855047.0, + "reward": 0.28125, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.469535231590271, + "sampling/importance_sampling_ratio/mean": 0.9998229742050171, + "sampling/importance_sampling_ratio/min": 0.6435920000076294, + "sampling/sampling_logp_difference/max": 0.440690279006958, + "sampling/sampling_logp_difference/mean": 0.01758875697851181, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 159.421875, + "completions/mean_terminated_length": 159.421875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.36572587490081787, + "epoch": 1.3063725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047132954323472774, + "kl": 0.08620236068964005, + "learning_rate": 6.965125158269618e-07, + "loss": 0.0007, + "num_tokens": 33883106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 0.9998907446861267, + "sampling/importance_sampling_ratio/min": 0.6875842809677124, + "sampling/sampling_logp_difference/max": 0.4682176113128662, + "sampling/sampling_logp_difference/mean": 0.015279294922947884, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 170.5625, + "completions/mean_terminated_length": 170.5625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.39573433995246887, + "epoch": 1.3075980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0250018360360977, + "kl": 0.06333038210868835, + "learning_rate": 6.958572640560491e-07, + "loss": 0.0092, + "num_tokens": 33917190.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5846731662750244, + "sampling/importance_sampling_ratio/mean": 1.0000784397125244, + "sampling/importance_sampling_ratio/min": 0.6348634958267212, + "sampling/sampling_logp_difference/max": 0.46037817001342773, + "sampling/sampling_logp_difference/mean": 0.015109268017113209, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 169.53125, + "completions/mean_terminated_length": 169.53125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.2902347147464752, + "epoch": 1.3088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.999846818846809, + "kl": 0.04580019414424896, + "learning_rate": 6.952016147072981e-07, + "loss": 0.012, + "num_tokens": 33943336.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6195276975631714, + "sampling/importance_sampling_ratio/mean": 1.0002738237380981, + "sampling/importance_sampling_ratio/min": 0.6298382878303528, + "sampling/sampling_logp_difference/max": 0.4821345806121826, + "sampling/sampling_logp_difference/mean": 0.013744648545980453, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 168.203125, + "completions/mean_terminated_length": 168.203125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4487200379371643, + "epoch": 1.3100490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.370408774743456, + "kl": 0.08198067545890808, + "learning_rate": 6.945455691116358e-07, + "loss": 0.007, + "num_tokens": 33972485.0, + "reward": 0.625, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4868720769882202, + "sampling/importance_sampling_ratio/mean": 1.000361442565918, + "sampling/importance_sampling_ratio/min": 0.6132069826126099, + "sampling/sampling_logp_difference/max": 0.48905277252197266, + "sampling/sampling_logp_difference/mean": 0.01734788343310356, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 163.34375, + "completions/mean_terminated_length": 163.34375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.32821619510650635, + "epoch": 1.3112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024778477204538275, + "kl": 0.03450584411621094, + "learning_rate": 6.938891286007928e-07, + "loss": 0.0003, + "num_tokens": 34007067.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000243067741394, + "sampling/importance_sampling_ratio/min": 0.6840278506278992, + "sampling/sampling_logp_difference/max": 0.7153477668762207, + "sampling/sampling_logp_difference/mean": 0.01339616347104311, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 153.203125, + "completions/mean_terminated_length": 153.203125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.39119040966033936, + "epoch": 1.3125, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0528701708787427, + "kl": 0.056332677602767944, + "learning_rate": 6.932322945073023e-07, + "loss": 0.0085, + "num_tokens": 34031000.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.4968187808990479, + "sampling/importance_sampling_ratio/mean": 1.0003398656845093, + "sampling/importance_sampling_ratio/min": 0.6802636384963989, + "sampling/sampling_logp_difference/max": 0.40334200859069824, + "sampling/sampling_logp_difference/mean": 0.01601910963654518, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 158.25, + "completions/mean_terminated_length": 158.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3235864043235779, + "epoch": 1.3137254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02920336386591169, + "kl": 0.03633510693907738, + "learning_rate": 6.925750681644953e-07, + "loss": 0.0004, + "num_tokens": 34056744.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.616931676864624, + "sampling/importance_sampling_ratio/mean": 0.9995802044868469, + "sampling/importance_sampling_ratio/min": 0.6068379282951355, + "sampling/sampling_logp_difference/max": 0.4994935989379883, + "sampling/sampling_logp_difference/mean": 0.013510503806173801, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 169.90625, + "completions/mean_terminated_length": 169.90625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2885391414165497, + "epoch": 1.3149509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018540908878206534, + "kl": 0.023091096431016922, + "learning_rate": 6.919174509065003e-07, + "loss": 0.0002, + "num_tokens": 34094530.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3570696115493774, + "sampling/importance_sampling_ratio/mean": 1.000139832496643, + "sampling/importance_sampling_ratio/min": 0.7576091289520264, + "sampling/sampling_logp_difference/max": 0.3053276538848877, + "sampling/sampling_logp_difference/mean": 0.011862678453326225, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 218.53125, + "completions/mean_terminated_length": 218.53125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.379422664642334, + "epoch": 1.3161764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6157068019165124, + "kl": 0.06551677733659744, + "learning_rate": 6.91259444068238e-07, + "loss": -0.0096, + "num_tokens": 34126740.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.354239821434021, + "sampling/importance_sampling_ratio/mean": 0.999758243560791, + "sampling/importance_sampling_ratio/min": 0.6220086216926575, + "sampling/sampling_logp_difference/max": 0.47480130195617676, + "sampling/sampling_logp_difference/mean": 0.013810522854328156, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 211.46875, + "completions/mean_terminated_length": 211.46875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.549732506275177, + "epoch": 1.3174019607843137, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7921593436581686, + "kl": 0.0827806368470192, + "learning_rate": 6.906010489854209e-07, + "loss": -0.0191, + "num_tokens": 34162354.0, + "reward": -0.1875, + "reward_std": 0.6116957664489746, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.4058430194854736, + "sampling/importance_sampling_ratio/mean": 0.9995255470275879, + "sampling/importance_sampling_ratio/min": 0.6106529235839844, + "sampling/sampling_logp_difference/max": 0.4932265281677246, + "sampling/sampling_logp_difference/mean": 0.018393494188785553, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 179.625, + "completions/mean_terminated_length": 179.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.5191712379455566, + "epoch": 1.3186274509803921, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.315060820072996, + "kl": 0.0638878345489502, + "learning_rate": 6.899422669945493e-07, + "loss": -0.0188, + "num_tokens": 34191818.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.7806079387664795, + "sampling/importance_sampling_ratio/mean": 0.9991124272346497, + "sampling/importance_sampling_ratio/min": 0.627777636051178, + "sampling/sampling_logp_difference/max": 0.5769548416137695, + "sampling/sampling_logp_difference/mean": 0.01793564110994339, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 143.75, + "completions/mean_terminated_length": 143.75, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4007861614227295, + "epoch": 1.3198529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029734999155876355, + "kl": 0.036175504326820374, + "learning_rate": 6.892830994329088e-07, + "loss": 0.0004, + "num_tokens": 34221402.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2835105657577515, + "sampling/importance_sampling_ratio/mean": 1.0006650686264038, + "sampling/importance_sampling_ratio/min": 0.7054614424705505, + "sampling/sampling_logp_difference/max": 0.34890317916870117, + "sampling/sampling_logp_difference/mean": 0.015294120647013187, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 214.078125, + "completions/mean_terminated_length": 214.078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.5529769659042358, + "epoch": 1.321078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2654157102524206, + "kl": 0.0877370834350586, + "learning_rate": 6.886235476385681e-07, + "loss": -0.0188, + "num_tokens": 34253039.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.2814871072769165, + "sampling/importance_sampling_ratio/mean": 0.9998075366020203, + "sampling/importance_sampling_ratio/min": 0.7272739410400391, + "sampling/sampling_logp_difference/max": 0.3184521198272705, + "sampling/sampling_logp_difference/mean": 0.01777786575257778, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 146.078125, + "completions/mean_terminated_length": 146.078125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.46738529205322266, + "epoch": 1.3223039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9922101852402277, + "kl": 0.07144574820995331, + "learning_rate": 6.879636129503751e-07, + "loss": -0.004, + "num_tokens": 34282676.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6232999563217163, + "sampling/importance_sampling_ratio/mean": 1.000260353088379, + "sampling/importance_sampling_ratio/min": 0.6622359156608582, + "sampling/sampling_logp_difference/max": 0.48446106910705566, + "sampling/sampling_logp_difference/mean": 0.017098616808652878, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 160.453125, + "completions/mean_terminated_length": 160.453125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.447221040725708, + "epoch": 1.3235294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03038422686122859, + "kl": 0.034434664994478226, + "learning_rate": 6.87303296707956e-07, + "loss": 0.0004, + "num_tokens": 34314769.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0005841255187988, + "sampling/importance_sampling_ratio/min": 0.6435648798942566, + "sampling/sampling_logp_difference/max": 0.440732479095459, + "sampling/sampling_logp_difference/mean": 0.015548791736364365, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 170.0, + "completions/mean_terminated_length": 170.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4014641046524048, + "epoch": 1.3247549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2977414891724848, + "kl": 0.060253456234931946, + "learning_rate": 6.866426002517105e-07, + "loss": -0.0312, + "num_tokens": 34339185.0, + "reward": -0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.4831007719039917, + "sampling/importance_sampling_ratio/mean": 0.9999460577964783, + "sampling/importance_sampling_ratio/min": 0.6402031183242798, + "sampling/sampling_logp_difference/max": 0.445969820022583, + "sampling/sampling_logp_difference/mean": 0.014733843505382538, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 174.703125, + "completions/mean_terminated_length": 174.703125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3389003872871399, + "epoch": 1.3259803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1914942711509917, + "kl": 0.054510023444890976, + "learning_rate": 6.859815249228105e-07, + "loss": 0.0098, + "num_tokens": 34367022.0, + "reward": 0.625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5194233655929565, + "sampling/importance_sampling_ratio/mean": 1.0001471042633057, + "sampling/importance_sampling_ratio/min": 0.6015815138816833, + "sampling/sampling_logp_difference/max": 0.5081932544708252, + "sampling/sampling_logp_difference/mean": 0.012282421812415123, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 156.71875, + "completions/mean_terminated_length": 156.71875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.4586590826511383, + "epoch": 1.3272058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2045103896785339, + "kl": 0.041404061019420624, + "learning_rate": 6.853200720631972e-07, + "loss": 0.0132, + "num_tokens": 34392620.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.8330409526824951, + "sampling/importance_sampling_ratio/mean": 1.0000096559524536, + "sampling/importance_sampling_ratio/min": 0.6203129291534424, + "sampling/sampling_logp_difference/max": 0.6059763431549072, + "sampling/sampling_logp_difference/mean": 0.01719982549548149, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 158.015625, + "completions/mean_terminated_length": 158.015625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3859842121601105, + "epoch": 1.3284313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02756333372447915, + "kl": 0.04340790957212448, + "learning_rate": 6.846582430155781e-07, + "loss": 0.0004, + "num_tokens": 34417645.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.635543704032898, + "sampling/importance_sampling_ratio/mean": 1.0003108978271484, + "sampling/importance_sampling_ratio/min": 0.714081883430481, + "sampling/sampling_logp_difference/max": 0.4919753074645996, + "sampling/sampling_logp_difference/mean": 0.014954311773180962, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4257662892341614, + "epoch": 1.329656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02157836716451573, + "kl": 0.045886047184467316, + "learning_rate": 6.839960391234242e-07, + "loss": 0.0004, + "num_tokens": 34441799.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3246698379516602, + "sampling/importance_sampling_ratio/mean": 0.9998934268951416, + "sampling/importance_sampling_ratio/min": 0.6550384759902954, + "sampling/sampling_logp_difference/max": 0.4230612516403198, + "sampling/sampling_logp_difference/mean": 0.015735294669866562, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 210.5625, + "completions/mean_terminated_length": 210.5625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.34981435537338257, + "epoch": 1.3308823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01947714238593482, + "kl": 0.029721500352025032, + "learning_rate": 6.833334617309672e-07, + "loss": 0.0003, + "num_tokens": 34474811.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5748045444488525, + "sampling/importance_sampling_ratio/mean": 1.000112771987915, + "sampling/importance_sampling_ratio/min": 0.6208308339118958, + "sampling/sampling_logp_difference/max": 0.4766967296600342, + "sampling/sampling_logp_difference/mean": 0.014098942279815674, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 178.3125, + "completions/mean_terminated_length": 178.3125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.41365596652030945, + "epoch": 1.3321078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04542017220947505, + "kl": 0.06938691437244415, + "learning_rate": 6.826705121831976e-07, + "loss": 0.0007, + "num_tokens": 34502447.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.546083927154541, + "sampling/importance_sampling_ratio/mean": 1.0008903741836548, + "sampling/importance_sampling_ratio/min": 0.6928634643554688, + "sampling/sampling_logp_difference/max": 0.43572521209716797, + "sampling/sampling_logp_difference/mean": 0.014942721463739872, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 194.734375, + "completions/mean_terminated_length": 194.734375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.37932366132736206, + "epoch": 1.3333333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015491019048548296, + "kl": 0.03091152012348175, + "learning_rate": 6.820071918258605e-07, + "loss": 0.0003, + "num_tokens": 34533678.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3494160175323486, + "sampling/importance_sampling_ratio/mean": 1.0000969171524048, + "sampling/importance_sampling_ratio/min": 0.7206055521965027, + "sampling/sampling_logp_difference/max": 0.3276634216308594, + "sampling/sampling_logp_difference/mean": 0.015398137271404266, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 148.140625, + "completions/mean_terminated_length": 148.140625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3391225337982178, + "epoch": 1.3345588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.043334780455243624, + "kl": 0.040297143161296844, + "learning_rate": 6.813435020054548e-07, + "loss": 0.0004, + "num_tokens": 34558567.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.375964879989624, + "sampling/importance_sampling_ratio/mean": 1.0002107620239258, + "sampling/importance_sampling_ratio/min": 0.18915744125843048, + "sampling/sampling_logp_difference/max": 1.6651755571365356, + "sampling/sampling_logp_difference/mean": 0.013925185427069664, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 140.5625, + "completions/mean_terminated_length": 140.5625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.40812548995018005, + "epoch": 1.3357843137254901, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5176622119872871, + "kl": 0.06736913323402405, + "learning_rate": 6.806794440692282e-07, + "loss": -0.0521, + "num_tokens": 34584811.0, + "reward": 0.125, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.3238334655761719, + "sampling/importance_sampling_ratio/mean": 1.000195026397705, + "sampling/importance_sampling_ratio/min": 0.6368677020072937, + "sampling/sampling_logp_difference/max": 0.45119333267211914, + "sampling/sampling_logp_difference/mean": 0.015394306741654873, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 184.078125, + "completions/mean_terminated_length": 184.078125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.347525417804718, + "epoch": 1.3370098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019446197898854066, + "kl": 0.03219003975391388, + "learning_rate": 6.800150193651767e-07, + "loss": 0.0003, + "num_tokens": 34613328.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3008006811141968, + "sampling/importance_sampling_ratio/mean": 0.9996593594551086, + "sampling/importance_sampling_ratio/min": 0.6955578327178955, + "sampling/sampling_logp_difference/max": 0.3630410432815552, + "sampling/sampling_logp_difference/mean": 0.012894319370388985, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 187.265625, + "completions/mean_terminated_length": 187.265625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3292316198348999, + "epoch": 1.3382352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01643373329539923, + "kl": 0.029864570125937462, + "learning_rate": 6.793502292420401e-07, + "loss": 0.0003, + "num_tokens": 34641025.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4144617319107056, + "sampling/importance_sampling_ratio/mean": 0.9999804496765137, + "sampling/importance_sampling_ratio/min": 0.6217187643051147, + "sampling/sampling_logp_difference/max": 0.4752674102783203, + "sampling/sampling_logp_difference/mean": 0.012645787559449673, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 234.296875, + "completions/mean_terminated_length": 234.296875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5638883709907532, + "epoch": 1.3394607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1076845969024987, + "kl": 0.05605001002550125, + "learning_rate": 6.786850750493005e-07, + "loss": -0.0405, + "num_tokens": 34673972.0, + "reward": -0.15625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4678847789764404, + "sampling/importance_sampling_ratio/mean": 1.000309705734253, + "sampling/importance_sampling_ratio/min": 0.6138759255409241, + "sampling/sampling_logp_difference/max": 0.4879624843597412, + "sampling/sampling_logp_difference/mean": 0.017899105325341225, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 171.1875, + "completions/mean_terminated_length": 171.1875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3278416693210602, + "epoch": 1.340686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018262285100793196, + "kl": 0.03446147218346596, + "learning_rate": 6.780195581371784e-07, + "loss": 0.0003, + "num_tokens": 34698960.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4232503175735474, + "sampling/importance_sampling_ratio/mean": 1.0003902912139893, + "sampling/importance_sampling_ratio/min": 0.7128939032554626, + "sampling/sampling_logp_difference/max": 0.35294318199157715, + "sampling/sampling_logp_difference/mean": 0.01291445642709732, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 593.0, + "completions/max_terminated_length": 593.0, + "completions/mean_length": 194.828125, + "completions/mean_terminated_length": 194.828125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.34137165546417236, + "epoch": 1.3419117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014778505546742511, + "kl": 0.02961350977420807, + "learning_rate": 6.773536798566313e-07, + "loss": 0.0003, + "num_tokens": 34728965.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.441498875617981, + "sampling/importance_sampling_ratio/mean": 1.0001440048217773, + "sampling/importance_sampling_ratio/min": 0.7211413979530334, + "sampling/sampling_logp_difference/max": 0.3656834363937378, + "sampling/sampling_logp_difference/mean": 0.013568964786827564, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 219.421875, + "completions/mean_terminated_length": 219.421875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3275423049926758, + "epoch": 1.343137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011822894651897455, + "kl": 0.024325821548700333, + "learning_rate": 6.766874415593495e-07, + "loss": 0.0002, + "num_tokens": 34760336.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5737786293029785, + "sampling/importance_sampling_ratio/mean": 0.9997653365135193, + "sampling/importance_sampling_ratio/min": 0.6470595002174377, + "sampling/sampling_logp_difference/max": 0.453479528427124, + "sampling/sampling_logp_difference/mean": 0.01228781696408987, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 178.703125, + "completions/mean_terminated_length": 178.703125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.359050989151001, + "epoch": 1.344362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017683037402748594, + "kl": 0.035708922892808914, + "learning_rate": 6.760208445977549e-07, + "loss": 0.0003, + "num_tokens": 34786397.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2990641593933105, + "sampling/importance_sampling_ratio/mean": 1.000245451927185, + "sampling/importance_sampling_ratio/min": 0.6298472881317139, + "sampling/sampling_logp_difference/max": 0.462277889251709, + "sampling/sampling_logp_difference/mean": 0.013666082173585892, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 188.328125, + "completions/mean_terminated_length": 188.328125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4512159824371338, + "epoch": 1.3455882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.589649904007106, + "kl": 0.03694362938404083, + "learning_rate": 6.753538903249974e-07, + "loss": 0.0039, + "num_tokens": 34824578.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4352165460586548, + "sampling/importance_sampling_ratio/mean": 0.9999997019767761, + "sampling/importance_sampling_ratio/min": 0.638515830039978, + "sampling/sampling_logp_difference/max": 0.4486088752746582, + "sampling/sampling_logp_difference/mean": 0.017055150121450424, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.5191019773483276, + "epoch": 1.346813725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4883177965879764, + "kl": 0.05407078564167023, + "learning_rate": 6.74686580094951e-07, + "loss": -0.0447, + "num_tokens": 34855834.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4751386642456055, + "sampling/importance_sampling_ratio/mean": 1.0001651048660278, + "sampling/importance_sampling_ratio/min": 0.6531107425689697, + "sampling/sampling_logp_difference/max": 0.42600858211517334, + "sampling/sampling_logp_difference/mean": 0.017602190375328064, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 217.03125, + "completions/mean_terminated_length": 217.03125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.5169070959091187, + "epoch": 1.3480392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7540769944517511, + "kl": 0.04155392199754715, + "learning_rate": 6.740189152622142e-07, + "loss": -0.0131, + "num_tokens": 34889244.0, + "reward": -0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6672985553741455, + "sampling/importance_sampling_ratio/mean": 1.0004644393920898, + "sampling/importance_sampling_ratio/min": 0.5918436646461487, + "sampling/sampling_logp_difference/max": 0.524512767791748, + "sampling/sampling_logp_difference/mean": 0.01677050068974495, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 166.21875, + "completions/mean_terminated_length": 166.21875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.37008750438690186, + "epoch": 1.3492647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020667374575687864, + "kl": 0.03254357725381851, + "learning_rate": 6.733508971821036e-07, + "loss": 0.0003, + "num_tokens": 34915226.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3038923740386963, + "sampling/importance_sampling_ratio/mean": 1.0001362562179565, + "sampling/importance_sampling_ratio/min": 0.6747278571128845, + "sampling/sampling_logp_difference/max": 0.39344584941864014, + "sampling/sampling_logp_difference/mean": 0.01481970027089119, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 256.765625, + "completions/mean_terminated_length": 256.765625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.47072160243988037, + "epoch": 1.3504901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01408650788272973, + "kl": 0.026061909273266792, + "learning_rate": 6.726825272106538e-07, + "loss": 0.0002, + "num_tokens": 34949867.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3423312902450562, + "sampling/importance_sampling_ratio/mean": 1.0000892877578735, + "sampling/importance_sampling_ratio/min": 0.6159026622772217, + "sampling/sampling_logp_difference/max": 0.4846663475036621, + "sampling/sampling_logp_difference/mean": 0.014897543005645275, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 252.25, + "completions/mean_terminated_length": 252.25, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.5809694528579712, + "epoch": 1.3517156862745099, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.0102333432107318, + "kl": 0.0789414718747139, + "learning_rate": 6.720138067046134e-07, + "loss": -0.0356, + "num_tokens": 34982715.0, + "reward": -0.25, + "reward_std": 0.5765564441680908, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.3394501209259033, + "sampling/importance_sampling_ratio/mean": 1.000265121459961, + "sampling/importance_sampling_ratio/min": 0.7155026793479919, + "sampling/sampling_logp_difference/max": 0.33476996421813965, + "sampling/sampling_logp_difference/mean": 0.01793723925948143, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 170.265625, + "completions/mean_terminated_length": 170.265625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.28509607911109924, + "epoch": 1.3529411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0186612970010302, + "kl": 0.035473503172397614, + "learning_rate": 6.713447370214431e-07, + "loss": 0.0003, + "num_tokens": 35008444.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4462963342666626, + "sampling/importance_sampling_ratio/mean": 1.000580072402954, + "sampling/importance_sampling_ratio/min": 0.6890444755554199, + "sampling/sampling_logp_difference/max": 0.3724493980407715, + "sampling/sampling_logp_difference/mean": 0.011612621136009693, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 186.4375, + "completions/mean_terminated_length": 186.4375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4017951190471649, + "epoch": 1.3541666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020731791424474785, + "kl": 0.03592874854803085, + "learning_rate": 6.706753195193116e-07, + "loss": 0.0004, + "num_tokens": 35036120.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3652873039245605, + "sampling/importance_sampling_ratio/mean": 0.999518871307373, + "sampling/importance_sampling_ratio/min": 0.6376883387565613, + "sampling/sampling_logp_difference/max": 0.4499056339263916, + "sampling/sampling_logp_difference/mean": 0.015169748105108738, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 162.296875, + "completions/mean_terminated_length": 162.296875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3948853611946106, + "epoch": 1.3553921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02274097173537011, + "kl": 0.04195543751120567, + "learning_rate": 6.700055555570941e-07, + "loss": 0.0004, + "num_tokens": 35063675.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.337462067604065, + "sampling/importance_sampling_ratio/mean": 0.9998512268066406, + "sampling/importance_sampling_ratio/min": 0.5934198498725891, + "sampling/sampling_logp_difference/max": 0.5218531489372253, + "sampling/sampling_logp_difference/mean": 0.015314958989620209, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 162.890625, + "completions/mean_terminated_length": 162.890625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.35850122570991516, + "epoch": 1.3566176470588236, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0855367154891349, + "kl": 0.03481462597846985, + "learning_rate": 6.693354464943688e-07, + "loss": -0.0038, + "num_tokens": 35089348.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6007455587387085, + "sampling/importance_sampling_ratio/mean": 1.0004498958587646, + "sampling/importance_sampling_ratio/min": 0.5633743405342102, + "sampling/sampling_logp_difference/max": 0.5738110542297363, + "sampling/sampling_logp_difference/mean": 0.014802731573581696, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 188.703125, + "completions/mean_terminated_length": 188.703125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.28801846504211426, + "epoch": 1.357843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01740773670001392, + "kl": 0.025060242041945457, + "learning_rate": 6.68664993691415e-07, + "loss": 0.0002, + "num_tokens": 35123761.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.586329698562622, + "sampling/importance_sampling_ratio/mean": 0.999829888343811, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.4614229202270508, + "sampling/sampling_logp_difference/mean": 0.011413703672587872, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 201.5, + "completions/mean_terminated_length": 201.5, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.39979416131973267, + "epoch": 1.3590686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01650341982688638, + "kl": 0.02971898391842842, + "learning_rate": 6.679941985092092e-07, + "loss": 0.0003, + "num_tokens": 35156673.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5042294263839722, + "sampling/importance_sampling_ratio/mean": 1.000229835510254, + "sampling/importance_sampling_ratio/min": 0.6448522210121155, + "sampling/sampling_logp_difference/max": 0.43873417377471924, + "sampling/sampling_logp_difference/mean": 0.015918847173452377, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 215.375, + "completions/mean_terminated_length": 215.375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.46098411083221436, + "epoch": 1.3602941176470589, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5147163764637823, + "kl": 0.052413132041692734, + "learning_rate": 6.673230623094231e-07, + "loss": -0.0074, + "num_tokens": 35189177.0, + "reward": 0.5, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005366802215576, + "sampling/importance_sampling_ratio/min": 0.5570670962333679, + "sampling/sampling_logp_difference/max": 0.7709481716156006, + "sampling/sampling_logp_difference/mean": 0.016182512044906616, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 302.609375, + "completions/mean_terminated_length": 302.609375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.42158064246177673, + "epoch": 1.3615196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9139938155662435, + "kl": 0.04148112237453461, + "learning_rate": 6.666515864544208e-07, + "loss": 0.02, + "num_tokens": 35225936.0, + "reward": 0.1875, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.4334468841552734, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.7078362703323364, + "sampling/sampling_logp_difference/max": 0.36008191108703613, + "sampling/sampling_logp_difference/mean": 0.014339843764901161, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 177.84375, + "completions/mean_terminated_length": 177.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4696926176548004, + "epoch": 1.3627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9144773961228163, + "kl": 0.06416451930999756, + "learning_rate": 6.659797723072558e-07, + "loss": 0.0167, + "num_tokens": 35257430.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.3812090158462524, + "sampling/importance_sampling_ratio/mean": 1.0007481575012207, + "sampling/importance_sampling_ratio/min": 0.7125915884971619, + "sampling/sampling_logp_difference/max": 0.33884692192077637, + "sampling/sampling_logp_difference/mean": 0.017195500433444977, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 225.984375, + "completions/mean_terminated_length": 225.984375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.36867815256118774, + "epoch": 1.3639705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6677750693593473, + "kl": 0.0362837016582489, + "learning_rate": 6.653076212316681e-07, + "loss": -0.0033, + "num_tokens": 35291461.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4058878421783447, + "sampling/importance_sampling_ratio/mean": 0.9996570944786072, + "sampling/importance_sampling_ratio/min": 0.7330043911933899, + "sampling/sampling_logp_difference/max": 0.34066903591156006, + "sampling/sampling_logp_difference/mean": 0.013059025630354881, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 204.609375, + "completions/mean_terminated_length": 204.609375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3267783224582672, + "epoch": 1.3651960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019101102209227092, + "kl": 0.02914278395473957, + "learning_rate": 6.646351345920818e-07, + "loss": 0.0003, + "num_tokens": 35323244.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4072213172912598, + "sampling/importance_sampling_ratio/mean": 1.0003342628479004, + "sampling/importance_sampling_ratio/min": 0.6636192202568054, + "sampling/sampling_logp_difference/max": 0.41004669666290283, + "sampling/sampling_logp_difference/mean": 0.013215331360697746, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 222.109375, + "completions/mean_terminated_length": 222.109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3697851896286011, + "epoch": 1.366421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027466954235500943, + "kl": 0.03940945118665695, + "learning_rate": 6.639623137536022e-07, + "loss": 0.0004, + "num_tokens": 35353187.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4167759418487549, + "sampling/importance_sampling_ratio/mean": 0.9999490976333618, + "sampling/importance_sampling_ratio/min": 0.7221565246582031, + "sampling/sampling_logp_difference/max": 0.34838390350341797, + "sampling/sampling_logp_difference/mean": 0.013105101883411407, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 197.53125, + "completions/mean_terminated_length": 197.53125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3024361729621887, + "epoch": 1.3676470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019841543706668845, + "kl": 0.032502297312021255, + "learning_rate": 6.63289160082013e-07, + "loss": 0.0003, + "num_tokens": 35381221.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5686460733413696, + "sampling/importance_sampling_ratio/mean": 1.0004857778549194, + "sampling/importance_sampling_ratio/min": 0.45635396242141724, + "sampling/sampling_logp_difference/max": 0.7844865322113037, + "sampling/sampling_logp_difference/mean": 0.013303949497640133, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 188.265625, + "completions/mean_terminated_length": 188.265625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.43511879444122314, + "epoch": 1.3688725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01976911540588862, + "kl": 0.03235086798667908, + "learning_rate": 6.626156749437736e-07, + "loss": 0.0003, + "num_tokens": 35410870.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4674023389816284, + "sampling/importance_sampling_ratio/mean": 0.9996806979179382, + "sampling/importance_sampling_ratio/min": 0.5858292579650879, + "sampling/sampling_logp_difference/max": 0.5347268581390381, + "sampling/sampling_logp_difference/mean": 0.015992164611816406, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 190.765625, + "completions/mean_terminated_length": 190.765625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.2932688593864441, + "epoch": 1.3700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6935468427767087, + "kl": 0.04061746224761009, + "learning_rate": 6.619418597060159e-07, + "loss": -0.0117, + "num_tokens": 35440487.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 1.0001976490020752, + "sampling/importance_sampling_ratio/min": 0.6843438148498535, + "sampling/sampling_logp_difference/max": 0.4682176113128662, + "sampling/sampling_logp_difference/mean": 0.012141115963459015, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 167.03125, + "completions/mean_terminated_length": 167.03125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3596198558807373, + "epoch": 1.3713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023028208968430264, + "kl": 0.035580575466156006, + "learning_rate": 6.612677157365425e-07, + "loss": 0.0003, + "num_tokens": 35468217.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3988498449325562, + "sampling/importance_sampling_ratio/mean": 1.000089168548584, + "sampling/importance_sampling_ratio/min": 0.6547345519065857, + "sampling/sampling_logp_difference/max": 0.42352545261383057, + "sampling/sampling_logp_difference/mean": 0.014147953130304813, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 215.78125, + "completions/mean_terminated_length": 215.78125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.4859886169433594, + "epoch": 1.3725490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.777610592194121, + "kl": 0.05148335546255112, + "learning_rate": 6.605932444038228e-07, + "loss": 0.0142, + "num_tokens": 35498235.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.455161452293396, + "sampling/importance_sampling_ratio/mean": 1.0001635551452637, + "sampling/importance_sampling_ratio/min": 0.6208274364471436, + "sampling/sampling_logp_difference/max": 0.4767022132873535, + "sampling/sampling_logp_difference/mean": 0.017418760806322098, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 224.53125, + "completions/mean_terminated_length": 224.53125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.38453611731529236, + "epoch": 1.3737745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0369864012470773, + "kl": 0.042825035750865936, + "learning_rate": 6.599184470769908e-07, + "loss": -0.0082, + "num_tokens": 35527037.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5212843418121338, + "sampling/importance_sampling_ratio/mean": 0.9996752738952637, + "sampling/importance_sampling_ratio/min": 0.6630414128303528, + "sampling/sampling_logp_difference/max": 0.4195549488067627, + "sampling/sampling_logp_difference/mean": 0.01430191844701767, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 227.296875, + "completions/mean_terminated_length": 227.296875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3261297345161438, + "epoch": 1.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053828586160433615, + "kl": 0.03368963301181793, + "learning_rate": 6.592433251258422e-07, + "loss": 0.0003, + "num_tokens": 35563776.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.434326410293579, + "sampling/importance_sampling_ratio/mean": 0.999552309513092, + "sampling/importance_sampling_ratio/min": 0.5676478147506714, + "sampling/sampling_logp_difference/max": 0.5662540197372437, + "sampling/sampling_logp_difference/mean": 0.0120453592389822, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 212.828125, + "completions/mean_terminated_length": 212.828125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3626338541507721, + "epoch": 1.3762254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01771876735195674, + "kl": 0.03627105802297592, + "learning_rate": 6.58567879920832e-07, + "loss": 0.0003, + "num_tokens": 35595109.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5750272274017334, + "sampling/importance_sampling_ratio/mean": 1.0000033378601074, + "sampling/importance_sampling_ratio/min": 0.6057386994361877, + "sampling/sampling_logp_difference/max": 0.5013065338134766, + "sampling/sampling_logp_difference/mean": 0.014857176691293716, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 156.8125, + "completions/mean_terminated_length": 156.8125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.28793010115623474, + "epoch": 1.3774509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02618859955521471, + "kl": 0.03437525033950806, + "learning_rate": 6.578921128330714e-07, + "loss": 0.0003, + "num_tokens": 35619017.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5308873653411865, + "sampling/importance_sampling_ratio/mean": 0.9994142055511475, + "sampling/importance_sampling_ratio/min": 0.6210438013076782, + "sampling/sampling_logp_difference/max": 0.47635364532470703, + "sampling/sampling_logp_difference/mean": 0.012552684172987938, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 196.609375, + "completions/mean_terminated_length": 196.609375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.42831873893737793, + "epoch": 1.3786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018212638858156226, + "kl": 0.036771561950445175, + "learning_rate": 6.572160252343242e-07, + "loss": 0.0003, + "num_tokens": 35652368.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5762969255447388, + "sampling/importance_sampling_ratio/mean": 1.0003396272659302, + "sampling/importance_sampling_ratio/min": 0.6428821682929993, + "sampling/sampling_logp_difference/max": 0.4550783634185791, + "sampling/sampling_logp_difference/mean": 0.0155325997620821, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 173.890625, + "completions/mean_terminated_length": 173.890625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.45053666830062866, + "epoch": 1.3799019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025006269869773442, + "kl": 0.04868462681770325, + "learning_rate": 6.565396184970059e-07, + "loss": 0.0005, + "num_tokens": 35683641.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.283663034439087, + "sampling/importance_sampling_ratio/mean": 0.9999324083328247, + "sampling/importance_sampling_ratio/min": 0.6247842907905579, + "sampling/sampling_logp_difference/max": 0.4703488349914551, + "sampling/sampling_logp_difference/mean": 0.017172731459140778, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 222.09375, + "completions/mean_terminated_length": 222.09375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3818378448486328, + "epoch": 1.3811274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014529073554556921, + "kl": 0.03208887577056885, + "learning_rate": 6.558628939941791e-07, + "loss": 0.0003, + "num_tokens": 35720255.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3865206241607666, + "sampling/importance_sampling_ratio/mean": 1.0001908540725708, + "sampling/importance_sampling_ratio/min": 0.6459605097770691, + "sampling/sampling_logp_difference/max": 0.43701696395874023, + "sampling/sampling_logp_difference/mean": 0.0143346032127738, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 239.890625, + "completions/mean_terminated_length": 239.890625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.5276911854743958, + "epoch": 1.3823529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8103121218514556, + "kl": 0.045806415379047394, + "learning_rate": 6.551858530995517e-07, + "loss": -0.0098, + "num_tokens": 35756152.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.465988039970398, + "sampling/importance_sampling_ratio/mean": 1.000345230102539, + "sampling/importance_sampling_ratio/min": 0.5677291750907898, + "sampling/sampling_logp_difference/max": 0.5661107301712036, + "sampling/sampling_logp_difference/mean": 0.015663184225559235, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 212.484375, + "completions/mean_terminated_length": 212.484375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4683011770248413, + "epoch": 1.383578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01798580085049312, + "kl": 0.030934587121009827, + "learning_rate": 6.545084971874736e-07, + "loss": 0.0003, + "num_tokens": 35790439.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.290934443473816, + "sampling/importance_sampling_ratio/mean": 0.9998828768730164, + "sampling/importance_sampling_ratio/min": 0.6298438310623169, + "sampling/sampling_logp_difference/max": 0.4622833728790283, + "sampling/sampling_logp_difference/mean": 0.01621684618294239, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 236.390625, + "completions/mean_terminated_length": 236.390625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3739738464355469, + "epoch": 1.3848039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016134855144591998, + "kl": 0.028364058583974838, + "learning_rate": 6.538308276329349e-07, + "loss": 0.0003, + "num_tokens": 35825600.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4213730096817017, + "sampling/importance_sampling_ratio/mean": 1.0001099109649658, + "sampling/importance_sampling_ratio/min": 0.6817567348480225, + "sampling/sampling_logp_difference/max": 0.38308238983154297, + "sampling/sampling_logp_difference/mean": 0.012722737155854702, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 180.484375, + "completions/mean_terminated_length": 180.484375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.4150697886943817, + "epoch": 1.3860294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02778097487537663, + "kl": 0.055892542004585266, + "learning_rate": 6.531528458115614e-07, + "loss": 0.0005, + "num_tokens": 35852735.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3184069395065308, + "sampling/importance_sampling_ratio/mean": 0.9997545480728149, + "sampling/importance_sampling_ratio/min": 0.6922799944877625, + "sampling/sampling_logp_difference/max": 0.36776483058929443, + "sampling/sampling_logp_difference/mean": 0.014705037698149681, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 214.015625, + "completions/mean_terminated_length": 214.015625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.38773012161254883, + "epoch": 1.3872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018687893142765078, + "kl": 0.03789406269788742, + "learning_rate": 6.524745530996136e-07, + "loss": 0.0004, + "num_tokens": 35885040.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.381638526916504, + "sampling/importance_sampling_ratio/mean": 1.0002174377441406, + "sampling/importance_sampling_ratio/min": 0.7143754959106445, + "sampling/sampling_logp_difference/max": 0.3363466262817383, + "sampling/sampling_logp_difference/mean": 0.013985749334096909, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 182.1875, + "completions/mean_terminated_length": 182.1875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3540676236152649, + "epoch": 1.3884803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01958145904747897, + "kl": 0.03956735134124756, + "learning_rate": 6.517959508739825e-07, + "loss": 0.0004, + "num_tokens": 35915516.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6641453504562378, + "sampling/importance_sampling_ratio/mean": 0.9995432496070862, + "sampling/importance_sampling_ratio/min": 0.6927388906478882, + "sampling/sampling_logp_difference/max": 0.5093116760253906, + "sampling/sampling_logp_difference/mean": 0.013443275354802608, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 238.46875, + "completions/mean_terminated_length": 238.46875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3899323344230652, + "epoch": 1.3897058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014861637691700475, + "kl": 0.03583185374736786, + "learning_rate": 6.511170405121877e-07, + "loss": 0.0003, + "num_tokens": 35947722.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.627052664756775, + "sampling/importance_sampling_ratio/mean": 0.9995888471603394, + "sampling/importance_sampling_ratio/min": 0.7298383712768555, + "sampling/sampling_logp_difference/max": 0.4867701530456543, + "sampling/sampling_logp_difference/mean": 0.01395932212471962, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 167.21875, + "completions/mean_terminated_length": 167.21875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3784646689891815, + "epoch": 1.3909313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8825164568974678, + "kl": 0.058056920766830444, + "learning_rate": 6.504378233923742e-07, + "loss": 0.0037, + "num_tokens": 35972984.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5136674642562866, + "sampling/importance_sampling_ratio/mean": 1.000325083732605, + "sampling/importance_sampling_ratio/min": 0.6269902586936951, + "sampling/sampling_logp_difference/max": 0.4668242931365967, + "sampling/sampling_logp_difference/mean": 0.015267467126250267, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 200.921875, + "completions/mean_terminated_length": 200.921875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4866487383842468, + "epoch": 1.392156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020093671919080434, + "kl": 0.044742751866579056, + "learning_rate": 6.497583008933097e-07, + "loss": 0.0004, + "num_tokens": 36003939.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3623809814453125, + "sampling/importance_sampling_ratio/mean": 0.9999260306358337, + "sampling/importance_sampling_ratio/min": 0.6157806515693665, + "sampling/sampling_logp_difference/max": 0.4848644733428955, + "sampling/sampling_logp_difference/mean": 0.016255199909210205, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 202.6875, + "completions/mean_terminated_length": 202.6875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.341106116771698, + "epoch": 1.3933823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018811027989333717, + "kl": 0.0390816330909729, + "learning_rate": 6.490784743943818e-07, + "loss": 0.0004, + "num_tokens": 36032239.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6553131341934204, + "sampling/importance_sampling_ratio/mean": 1.0006049871444702, + "sampling/importance_sampling_ratio/min": 0.7228627800941467, + "sampling/sampling_logp_difference/max": 0.5039901733398438, + "sampling/sampling_logp_difference/mean": 0.013343091122806072, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 203.171875, + "completions/mean_terminated_length": 203.171875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.4166279137134552, + "epoch": 1.3946078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0201380357592192, + "kl": 0.03564707189798355, + "learning_rate": 6.483983452755952e-07, + "loss": 0.0003, + "num_tokens": 36067290.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4581257104873657, + "sampling/importance_sampling_ratio/mean": 1.0003366470336914, + "sampling/importance_sampling_ratio/min": 0.6117782592773438, + "sampling/sampling_logp_difference/max": 0.49138545989990234, + "sampling/sampling_logp_difference/mean": 0.014345650561153889, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 236.875, + "completions/mean_terminated_length": 236.875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4097669720649719, + "epoch": 1.3958333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015900443786307623, + "kl": 0.03139366954565048, + "learning_rate": 6.477179149175692e-07, + "loss": 0.0003, + "num_tokens": 36104642.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 0.9999328255653381, + "sampling/importance_sampling_ratio/min": 0.6221033930778503, + "sampling/sampling_logp_difference/max": 0.48689985275268555, + "sampling/sampling_logp_difference/mean": 0.014695117250084877, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 212.875, + "completions/mean_terminated_length": 212.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5011221170425415, + "epoch": 1.3970588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017791330374902426, + "kl": 0.04139325022697449, + "learning_rate": 6.470371847015341e-07, + "loss": 0.0004, + "num_tokens": 36140154.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3758145570755005, + "sampling/importance_sampling_ratio/mean": 1.0001449584960938, + "sampling/importance_sampling_ratio/min": 0.6445356011390686, + "sampling/sampling_logp_difference/max": 0.4392251968383789, + "sampling/sampling_logp_difference/mean": 0.016765639185905457, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 212.125, + "completions/mean_terminated_length": 212.125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4664650559425354, + "epoch": 1.3982843137254901, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.871845786759249, + "kl": 0.045797113329172134, + "learning_rate": 6.463561560093292e-07, + "loss": -0.0076, + "num_tokens": 36172914.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.572105050086975, + "sampling/importance_sampling_ratio/mean": 1.0001112222671509, + "sampling/importance_sampling_ratio/min": 0.6616653800010681, + "sampling/sampling_logp_difference/max": 0.45241546630859375, + "sampling/sampling_logp_difference/mean": 0.016561444848775864, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 653.0, + "completions/max_terminated_length": 653.0, + "completions/mean_length": 294.265625, + "completions/mean_terminated_length": 294.265625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3774057626724243, + "epoch": 1.3995098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013212440900637555, + "kl": 0.024201175197958946, + "learning_rate": 6.456748302233994e-07, + "loss": 0.0002, + "num_tokens": 36209235.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5392427444458008, + "sampling/importance_sampling_ratio/mean": 1.0007257461547852, + "sampling/importance_sampling_ratio/min": 0.7199281454086304, + "sampling/sampling_logp_difference/max": 0.4312906265258789, + "sampling/sampling_logp_difference/mean": 0.012639081105589867, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 163.484375, + "completions/mean_terminated_length": 163.484375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.4590701460838318, + "epoch": 1.4007352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2703788828748115, + "kl": 0.050560418516397476, + "learning_rate": 6.449932087267931e-07, + "loss": -0.0102, + "num_tokens": 36235458.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.402066946029663, + "sampling/importance_sampling_ratio/mean": 1.0001840591430664, + "sampling/importance_sampling_ratio/min": 0.6709317564964294, + "sampling/sampling_logp_difference/max": 0.39908790588378906, + "sampling/sampling_logp_difference/mean": 0.0171342883259058, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 176.359375, + "completions/mean_terminated_length": 176.359375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.38662445545196533, + "epoch": 1.4019607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017236828833540965, + "kl": 0.037596482783555984, + "learning_rate": 6.443112929031586e-07, + "loss": 0.0003, + "num_tokens": 36261545.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3856422901153564, + "sampling/importance_sampling_ratio/mean": 0.9996615648269653, + "sampling/importance_sampling_ratio/min": 0.6394821405410767, + "sampling/sampling_logp_difference/max": 0.447096586227417, + "sampling/sampling_logp_difference/mean": 0.014809362590312958, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 220.5, + "completions/mean_terminated_length": 220.5, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.40431851148605347, + "epoch": 1.403186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021331365625703074, + "kl": 0.03596138954162598, + "learning_rate": 6.43629084136742e-07, + "loss": 0.0003, + "num_tokens": 36293065.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6057169437408447, + "sampling/importance_sampling_ratio/mean": 1.0001697540283203, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.4735703468322754, + "sampling/sampling_logp_difference/mean": 0.014951720833778381, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 207.734375, + "completions/mean_terminated_length": 207.734375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4531485140323639, + "epoch": 1.4044117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018203394563173974, + "kl": 0.04105984419584274, + "learning_rate": 6.429465838123838e-07, + "loss": 0.0004, + "num_tokens": 36323656.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6343212127685547, + "sampling/importance_sampling_ratio/mean": 0.9997382760047913, + "sampling/importance_sampling_ratio/min": 0.6331372857093811, + "sampling/sampling_logp_difference/max": 0.4912276268005371, + "sampling/sampling_logp_difference/mean": 0.01589648798108101, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 259.3125, + "completions/mean_terminated_length": 259.3125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.3934080898761749, + "epoch": 1.405637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020914130194614033, + "kl": 0.035692743957042694, + "learning_rate": 6.422637933155162e-07, + "loss": 0.0003, + "num_tokens": 36358460.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2907326221466064, + "sampling/importance_sampling_ratio/mean": 0.9999151229858398, + "sampling/importance_sampling_ratio/min": 0.675743579864502, + "sampling/sampling_logp_difference/max": 0.39194154739379883, + "sampling/sampling_logp_difference/mean": 0.014120127074420452, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 152.8125, + "completions/mean_terminated_length": 152.8125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.29732319712638855, + "epoch": 1.406862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020387493413349046, + "kl": 0.03376832604408264, + "learning_rate": 6.41580714032161e-07, + "loss": 0.0003, + "num_tokens": 36381648.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5645488500595093, + "sampling/importance_sampling_ratio/mean": 1.0004475116729736, + "sampling/importance_sampling_ratio/min": 0.6080793738365173, + "sampling/sampling_logp_difference/max": 0.4974498748779297, + "sampling/sampling_logp_difference/mean": 0.012887522578239441, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 177.53125, + "completions/mean_terminated_length": 177.53125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.41781967878341675, + "epoch": 1.4080882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02404443127113517, + "kl": 0.04842346906661987, + "learning_rate": 6.408973473489257e-07, + "loss": 0.0004, + "num_tokens": 36408722.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8529335260391235, + "sampling/importance_sampling_ratio/mean": 0.9999231696128845, + "sampling/importance_sampling_ratio/min": 0.6887311339378357, + "sampling/sampling_logp_difference/max": 0.6167700290679932, + "sampling/sampling_logp_difference/mean": 0.015981681644916534, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 190.0, + "completions/mean_terminated_length": 190.0, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4293907880783081, + "epoch": 1.409313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02367845774403614, + "kl": 0.045156046748161316, + "learning_rate": 6.402136946530014e-07, + "loss": 0.0004, + "num_tokens": 36440946.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4513518810272217, + "sampling/importance_sampling_ratio/mean": 1.0003032684326172, + "sampling/importance_sampling_ratio/min": 0.6622359752655029, + "sampling/sampling_logp_difference/max": 0.4121333360671997, + "sampling/sampling_logp_difference/mean": 0.01704506203532219, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 224.109375, + "completions/mean_terminated_length": 224.109375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4653753340244293, + "epoch": 1.4105392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9583863934305666, + "kl": 0.040310852229595184, + "learning_rate": 6.395297573321597e-07, + "loss": 0.041, + "num_tokens": 36470825.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.656188726425171, + "sampling/importance_sampling_ratio/mean": 1.0000224113464355, + "sampling/importance_sampling_ratio/min": 0.6066967248916626, + "sampling/sampling_logp_difference/max": 0.504518985748291, + "sampling/sampling_logp_difference/mean": 0.01696554198861122, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 259.203125, + "completions/mean_terminated_length": 259.203125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.4103810787200928, + "epoch": 1.4117647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016200855867439422, + "kl": 0.027447929605841637, + "learning_rate": 6.388455367747502e-07, + "loss": 0.0003, + "num_tokens": 36506230.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4783718585968018, + "sampling/importance_sampling_ratio/mean": 1.0000253915786743, + "sampling/importance_sampling_ratio/min": 0.7335873246192932, + "sampling/sampling_logp_difference/max": 0.3909413814544678, + "sampling/sampling_logp_difference/mean": 0.013810326345264912, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 251.21875, + "completions/mean_terminated_length": 251.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4707592725753784, + "epoch": 1.4129901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7627371249958246, + "kl": 0.04826528578996658, + "learning_rate": 6.38161034369697e-07, + "loss": -0.0081, + "num_tokens": 36539972.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.502083420753479, + "sampling/importance_sampling_ratio/mean": 0.9995489716529846, + "sampling/importance_sampling_ratio/min": 0.5256656408309937, + "sampling/sampling_logp_difference/max": 0.6430898904800415, + "sampling/sampling_logp_difference/mean": 0.015454958193004131, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 201.421875, + "completions/mean_terminated_length": 201.421875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3892950415611267, + "epoch": 1.4142156862745099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017690955857901577, + "kl": 0.028931012377142906, + "learning_rate": 6.37476251506497e-07, + "loss": 0.0003, + "num_tokens": 36568895.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4651638269424438, + "sampling/importance_sampling_ratio/mean": 1.0001156330108643, + "sampling/importance_sampling_ratio/min": 0.649138331413269, + "sampling/sampling_logp_difference/max": 0.43210935592651367, + "sampling/sampling_logp_difference/mean": 0.013972951099276543, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 202.6875, + "completions/mean_terminated_length": 202.6875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.519242525100708, + "epoch": 1.4154411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02135539836263351, + "kl": 0.04262981563806534, + "learning_rate": 6.367911895752158e-07, + "loss": 0.0004, + "num_tokens": 36603019.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4156533479690552, + "sampling/importance_sampling_ratio/mean": 0.9998533725738525, + "sampling/importance_sampling_ratio/min": 0.6932653784751892, + "sampling/sampling_logp_difference/max": 0.3663424253463745, + "sampling/sampling_logp_difference/mean": 0.016897693276405334, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 230.453125, + "completions/mean_terminated_length": 230.453125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.416151762008667, + "epoch": 1.4166666666666667, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0189186992301897, + "kl": 0.03677485138177872, + "learning_rate": 6.361058499664855e-07, + "loss": 0.0004, + "num_tokens": 36638200.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6617846488952637, + "sampling/importance_sampling_ratio/mean": 0.9995614886283875, + "sampling/importance_sampling_ratio/min": 0.6192152500152588, + "sampling/sampling_logp_difference/max": 0.5078921318054199, + "sampling/sampling_logp_difference/mean": 0.014120293781161308, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 197.796875, + "completions/mean_terminated_length": 197.796875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.47481685876846313, + "epoch": 1.4178921568627452, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7106213381086652, + "kl": 0.039433401077985764, + "learning_rate": 6.354202340715026e-07, + "loss": -0.001, + "num_tokens": 36669771.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.373407006263733, + "sampling/importance_sampling_ratio/mean": 0.999471127986908, + "sampling/importance_sampling_ratio/min": 0.6230600476264954, + "sampling/sampling_logp_difference/max": 0.4731123447418213, + "sampling/sampling_logp_difference/mean": 0.016685422509908676, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 182.328125, + "completions/mean_terminated_length": 182.328125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3988850712776184, + "epoch": 1.4191176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021343238010666843, + "kl": 0.04164959117770195, + "learning_rate": 6.347343432820234e-07, + "loss": 0.0004, + "num_tokens": 36701792.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3889849185943604, + "sampling/importance_sampling_ratio/mean": 1.000058889389038, + "sampling/importance_sampling_ratio/min": 0.6187108159065247, + "sampling/sampling_logp_difference/max": 0.4801173210144043, + "sampling/sampling_logp_difference/mean": 0.015087027102708817, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 182.046875, + "completions/mean_terminated_length": 182.046875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4099763035774231, + "epoch": 1.420343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018067353414028858, + "kl": 0.034441027790308, + "learning_rate": 6.340481789903634e-07, + "loss": 0.0003, + "num_tokens": 36737171.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6580297946929932, + "sampling/importance_sampling_ratio/mean": 0.9997607469558716, + "sampling/importance_sampling_ratio/min": 0.7133997082710266, + "sampling/sampling_logp_difference/max": 0.5056300163269043, + "sampling/sampling_logp_difference/mean": 0.014864427037537098, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 200.40625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4773097634315491, + "epoch": 1.4215686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0293788881414854, + "kl": 0.0406605489552021, + "learning_rate": 6.333617425893919e-07, + "loss": -0.0233, + "num_tokens": 36765997.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.3286681175231934, + "sampling/importance_sampling_ratio/mean": 1.0002083778381348, + "sampling/importance_sampling_ratio/min": 0.6262644529342651, + "sampling/sampling_logp_difference/max": 0.46798253059387207, + "sampling/sampling_logp_difference/mean": 0.01621146872639656, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 171.65625, + "completions/mean_terminated_length": 171.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.37978655099868774, + "epoch": 1.4227941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018114489749995775, + "kl": 0.03288944438099861, + "learning_rate": 6.326750354725319e-07, + "loss": 0.0003, + "num_tokens": 36794743.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.800256371498108, + "sampling/importance_sampling_ratio/mean": 1.0006855726242065, + "sampling/importance_sampling_ratio/min": 0.617500901222229, + "sampling/sampling_logp_difference/max": 0.5879291296005249, + "sampling/sampling_logp_difference/mean": 0.015034069307148457, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 213.265625, + "completions/mean_terminated_length": 213.265625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.44157570600509644, + "epoch": 1.4240196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015580433843617932, + "kl": 0.026813317090272903, + "learning_rate": 6.319880590337548e-07, + "loss": 0.0003, + "num_tokens": 36827144.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000941753387451, + "sampling/importance_sampling_ratio/min": 0.6778589487075806, + "sampling/sampling_logp_difference/max": 0.8838214874267578, + "sampling/sampling_logp_difference/mean": 0.016190864145755768, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 150.84375, + "completions/mean_terminated_length": 150.84375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4329409599304199, + "epoch": 1.4252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03626490405225596, + "kl": 0.04385094344615936, + "learning_rate": 6.313008146675799e-07, + "loss": 0.0004, + "num_tokens": 36856030.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.393547773361206, + "sampling/importance_sampling_ratio/mean": 1.0001081228256226, + "sampling/importance_sampling_ratio/min": 0.6501546502113342, + "sampling/sampling_logp_difference/max": 0.4305450916290283, + "sampling/sampling_logp_difference/mean": 0.017302222549915314, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 232.28125, + "completions/mean_terminated_length": 232.28125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.33402836322784424, + "epoch": 1.4264705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012219846207602742, + "kl": 0.02296830713748932, + "learning_rate": 6.306133037690692e-07, + "loss": 0.0002, + "num_tokens": 36888976.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3758339881896973, + "sampling/importance_sampling_ratio/mean": 1.0000852346420288, + "sampling/importance_sampling_ratio/min": 0.6108908653259277, + "sampling/sampling_logp_difference/max": 0.49283695220947266, + "sampling/sampling_logp_difference/mean": 0.012221208773553371, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 189.703125, + "completions/mean_terminated_length": 189.703125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3288557827472687, + "epoch": 1.4276960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018641186944606498, + "kl": 0.0329284593462944, + "learning_rate": 6.299255277338264e-07, + "loss": 0.0003, + "num_tokens": 36920125.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4268949031829834, + "sampling/importance_sampling_ratio/mean": 1.0001251697540283, + "sampling/importance_sampling_ratio/min": 0.6372838616371155, + "sampling/sampling_logp_difference/max": 0.45054006576538086, + "sampling/sampling_logp_difference/mean": 0.01316780410706997, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 188.265625, + "completions/mean_terminated_length": 188.265625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3560446500778198, + "epoch": 1.428921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0224097862378743, + "kl": 0.0396597683429718, + "learning_rate": 6.292374879579934e-07, + "loss": 0.0004, + "num_tokens": 36946382.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4545938968658447, + "sampling/importance_sampling_ratio/mean": 1.0007069110870361, + "sampling/importance_sampling_ratio/min": 0.6191086173057556, + "sampling/sampling_logp_difference/max": 0.4794745445251465, + "sampling/sampling_logp_difference/mean": 0.013722263276576996, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 210.453125, + "completions/mean_terminated_length": 210.453125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.40921494364738464, + "epoch": 1.4301470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022115861694415706, + "kl": 0.03952673822641373, + "learning_rate": 6.285491858382473e-07, + "loss": 0.0004, + "num_tokens": 36979387.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4917949438095093, + "sampling/importance_sampling_ratio/mean": 1.0006740093231201, + "sampling/importance_sampling_ratio/min": 0.7042884230613708, + "sampling/sampling_logp_difference/max": 0.3999800682067871, + "sampling/sampling_logp_difference/mean": 0.015281391330063343, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 258.984375, + "completions/mean_terminated_length": 258.984375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.473724901676178, + "epoch": 1.4313725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022753461567393787, + "kl": 0.04564649984240532, + "learning_rate": 6.278606227717978e-07, + "loss": 0.0004, + "num_tokens": 37017994.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.27568781375885, + "sampling/importance_sampling_ratio/mean": 0.9997217059135437, + "sampling/importance_sampling_ratio/min": 0.6260144114494324, + "sampling/sampling_logp_difference/max": 0.4683818817138672, + "sampling/sampling_logp_difference/mean": 0.015333606861531734, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 209.078125, + "completions/mean_terminated_length": 209.078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4920414090156555, + "epoch": 1.4325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01657223631128751, + "kl": 0.035376884043216705, + "learning_rate": 6.271718001563843e-07, + "loss": 0.0003, + "num_tokens": 37047711.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4162039756774902, + "sampling/importance_sampling_ratio/mean": 0.9997128248214722, + "sampling/importance_sampling_ratio/min": 0.6210846304893494, + "sampling/sampling_logp_difference/max": 0.476287841796875, + "sampling/sampling_logp_difference/mean": 0.016096480190753937, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 221.890625, + "completions/mean_terminated_length": 221.890625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4462301433086395, + "epoch": 1.4338235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8204554633995234, + "kl": 0.04333376884460449, + "learning_rate": 6.264827193902731e-07, + "loss": -0.0225, + "num_tokens": 37084008.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.578283667564392, + "sampling/importance_sampling_ratio/mean": 1.000231146812439, + "sampling/importance_sampling_ratio/min": 0.6549274921417236, + "sampling/sampling_logp_difference/max": 0.45633792877197266, + "sampling/sampling_logp_difference/mean": 0.01599014550447464, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 210.515625, + "completions/mean_terminated_length": 210.515625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.5151039958000183, + "epoch": 1.4350490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04422786233709827, + "kl": 0.06160306930541992, + "learning_rate": 6.257933818722542e-07, + "loss": 0.0005, + "num_tokens": 37117737.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3585052490234375, + "sampling/importance_sampling_ratio/mean": 1.0009124279022217, + "sampling/importance_sampling_ratio/min": 0.6124889850616455, + "sampling/sampling_logp_difference/max": 0.49022436141967773, + "sampling/sampling_logp_difference/mean": 0.018399199470877647, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 200.28125, + "completions/mean_terminated_length": 200.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.46506404876708984, + "epoch": 1.4362745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7820850745089246, + "kl": 0.03863566368818283, + "learning_rate": 6.251037890016395e-07, + "loss": -0.0308, + "num_tokens": 37145963.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4675662517547607, + "sampling/importance_sampling_ratio/mean": 1.000497817993164, + "sampling/importance_sampling_ratio/min": 0.7670260667800903, + "sampling/sampling_logp_difference/max": 0.38360536098480225, + "sampling/sampling_logp_difference/mean": 0.016250811517238617, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 206.34375, + "completions/mean_terminated_length": 206.34375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.41054677963256836, + "epoch": 1.4375, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6938400982338712, + "kl": 0.045754268765449524, + "learning_rate": 6.244139421782587e-07, + "loss": -0.0011, + "num_tokens": 37172913.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4826430082321167, + "sampling/importance_sampling_ratio/mean": 0.9999325275421143, + "sampling/importance_sampling_ratio/min": 0.6914175748825073, + "sampling/sampling_logp_difference/max": 0.39382636547088623, + "sampling/sampling_logp_difference/mean": 0.013678722083568573, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 237.3125, + "completions/mean_terminated_length": 237.3125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3614650070667267, + "epoch": 1.4387254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013151961876267592, + "kl": 0.02268226258456707, + "learning_rate": 6.237238428024571e-07, + "loss": 0.0002, + "num_tokens": 37207365.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5792675018310547, + "sampling/importance_sampling_ratio/mean": 1.0003628730773926, + "sampling/importance_sampling_ratio/min": 0.5141401886940002, + "sampling/sampling_logp_difference/max": 0.6652593612670898, + "sampling/sampling_logp_difference/mean": 0.014220098033547401, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 185.46875, + "completions/mean_terminated_length": 185.46875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.40694963932037354, + "epoch": 1.4399509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019817491208471438, + "kl": 0.04109745845198631, + "learning_rate": 6.230334922750929e-07, + "loss": 0.0004, + "num_tokens": 37233379.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6207282543182373, + "sampling/importance_sampling_ratio/mean": 1.00010347366333, + "sampling/importance_sampling_ratio/min": 0.6079967617988586, + "sampling/sampling_logp_difference/max": 0.4975857734680176, + "sampling/sampling_logp_difference/mean": 0.015249053947627544, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 193.9375, + "completions/mean_terminated_length": 193.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.34148937463760376, + "epoch": 1.4411764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02603340705294334, + "kl": 0.030143603682518005, + "learning_rate": 6.223428919975338e-07, + "loss": 0.0003, + "num_tokens": 37264687.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5455199480056763, + "sampling/importance_sampling_ratio/mean": 1.0002055168151855, + "sampling/importance_sampling_ratio/min": 0.6264569759368896, + "sampling/sampling_logp_difference/max": 0.46767520904541016, + "sampling/sampling_logp_difference/mean": 0.01314076129347086, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 219.09375, + "completions/mean_terminated_length": 219.09375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.48918333649635315, + "epoch": 1.4424019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.040913893462463, + "kl": 0.07333552837371826, + "learning_rate": 6.216520433716544e-07, + "loss": 0.0169, + "num_tokens": 37296421.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.434165358543396, + "sampling/importance_sampling_ratio/mean": 0.9998877644538879, + "sampling/importance_sampling_ratio/min": 0.6926529407501221, + "sampling/sampling_logp_difference/max": 0.367226243019104, + "sampling/sampling_logp_difference/mean": 0.01609702780842781, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 206.578125, + "completions/mean_terminated_length": 206.578125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.45933449268341064, + "epoch": 1.4436274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015941078259987168, + "kl": 0.03184128180146217, + "learning_rate": 6.209609477998338e-07, + "loss": 0.0003, + "num_tokens": 37330490.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6488854885101318, + "sampling/importance_sampling_ratio/mean": 1.0001170635223389, + "sampling/importance_sampling_ratio/min": 0.6173411011695862, + "sampling/sampling_logp_difference/max": 0.5000996589660645, + "sampling/sampling_logp_difference/mean": 0.015895390883088112, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 191.234375, + "completions/mean_terminated_length": 191.234375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.45986810326576233, + "epoch": 1.4448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8005740570694926, + "kl": 0.05514213815331459, + "learning_rate": 6.202696066849524e-07, + "loss": 0.0067, + "num_tokens": 37357225.0, + "reward": -0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6609363555908203, + "sampling/importance_sampling_ratio/mean": 1.000389575958252, + "sampling/importance_sampling_ratio/min": 0.6299614310264587, + "sampling/sampling_logp_difference/max": 0.5073815584182739, + "sampling/sampling_logp_difference/mean": 0.01626450940966606, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 198.90625, + "completions/mean_terminated_length": 198.90625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.4463042616844177, + "epoch": 1.446078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8882024418683567, + "kl": 0.031965792179107666, + "learning_rate": 6.195780214303887e-07, + "loss": -0.0144, + "num_tokens": 37392835.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4765907526016235, + "sampling/importance_sampling_ratio/mean": 0.9997777938842773, + "sampling/importance_sampling_ratio/min": 0.6732635498046875, + "sampling/sampling_logp_difference/max": 0.3956184387207031, + "sampling/sampling_logp_difference/mean": 0.014926549047231674, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 179.453125, + "completions/mean_terminated_length": 179.453125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4128044843673706, + "epoch": 1.4473039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0281274341082484, + "kl": 0.04153367131948471, + "learning_rate": 6.188861934400171e-07, + "loss": 0.0004, + "num_tokens": 37427808.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6002800464630127, + "sampling/importance_sampling_ratio/mean": 1.000360131263733, + "sampling/importance_sampling_ratio/min": 0.6093294620513916, + "sampling/sampling_logp_difference/max": 0.49539613723754883, + "sampling/sampling_logp_difference/mean": 0.01555293332785368, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 224.484375, + "completions/mean_terminated_length": 224.484375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4386179447174072, + "epoch": 1.4485294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015855737822550973, + "kl": 0.02755020745098591, + "learning_rate": 6.181941241182043e-07, + "loss": 0.0003, + "num_tokens": 37464927.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8085458278656006, + "sampling/importance_sampling_ratio/mean": 1.0002679824829102, + "sampling/importance_sampling_ratio/min": 0.5397461652755737, + "sampling/sampling_logp_difference/max": 0.6166563034057617, + "sampling/sampling_logp_difference/mean": 0.01591423712670803, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 183.75, + "completions/mean_terminated_length": 183.75, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.4858577251434326, + "epoch": 1.4497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0355042417143942, + "kl": 0.05320306867361069, + "learning_rate": 6.175018148698076e-07, + "loss": 0.0005, + "num_tokens": 37495119.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7035918235778809, + "sampling/importance_sampling_ratio/mean": 1.0005650520324707, + "sampling/importance_sampling_ratio/min": 0.6632429361343384, + "sampling/sampling_logp_difference/max": 0.5327389240264893, + "sampling/sampling_logp_difference/mean": 0.017133308574557304, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 209.59375, + "completions/mean_terminated_length": 209.59375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.5416671633720398, + "epoch": 1.4509803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1585298101868597, + "kl": 0.05938192084431648, + "learning_rate": 6.168092671001705e-07, + "loss": -0.0202, + "num_tokens": 37529157.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4024341106414795, + "sampling/importance_sampling_ratio/mean": 1.0000582933425903, + "sampling/importance_sampling_ratio/min": 0.6091784238815308, + "sampling/sampling_logp_difference/max": 0.49564409255981445, + "sampling/sampling_logp_difference/mean": 0.017426788806915283, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 162.890625, + "completions/mean_terminated_length": 162.890625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.45169419050216675, + "epoch": 1.4522058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02232771027440117, + "kl": 0.04014962911605835, + "learning_rate": 6.161164822151213e-07, + "loss": 0.0004, + "num_tokens": 37558126.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3685723543167114, + "sampling/importance_sampling_ratio/mean": 1.000321626663208, + "sampling/importance_sampling_ratio/min": 0.6659936904907227, + "sampling/sampling_logp_difference/max": 0.4064750671386719, + "sampling/sampling_logp_difference/mean": 0.015808694064617157, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 235.78125, + "completions/mean_terminated_length": 235.78125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3496420383453369, + "epoch": 1.4534313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015577012636220793, + "kl": 0.026788845658302307, + "learning_rate": 6.154234616209692e-07, + "loss": 0.0003, + "num_tokens": 37592784.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3528342247009277, + "sampling/importance_sampling_ratio/mean": 0.9999212026596069, + "sampling/importance_sampling_ratio/min": 0.6132686734199524, + "sampling/sampling_logp_difference/max": 0.4889521598815918, + "sampling/sampling_logp_difference/mean": 0.012509230524301529, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 208.71875, + "completions/mean_terminated_length": 208.71875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.46052321791648865, + "epoch": 1.454656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018013028851222145, + "kl": 0.02776549756526947, + "learning_rate": 6.147302067245028e-07, + "loss": 0.0003, + "num_tokens": 37622414.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6190518140792847, + "sampling/importance_sampling_ratio/mean": 0.9995525479316711, + "sampling/importance_sampling_ratio/min": 0.6144495010375977, + "sampling/sampling_logp_difference/max": 0.4870285987854004, + "sampling/sampling_logp_difference/mean": 0.015121417120099068, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 203.21875, + "completions/mean_terminated_length": 203.21875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4830366373062134, + "epoch": 1.4558823529411764, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02214156207247184, + "kl": 0.04310956969857216, + "learning_rate": 6.140367189329847e-07, + "loss": 0.0004, + "num_tokens": 37653148.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5278550386428833, + "sampling/importance_sampling_ratio/mean": 0.9998105764389038, + "sampling/importance_sampling_ratio/min": 0.7727299928665161, + "sampling/sampling_logp_difference/max": 0.42386484146118164, + "sampling/sampling_logp_difference/mean": 0.01657957024872303, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 186.40625, + "completions/mean_terminated_length": 186.40625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4567105770111084, + "epoch": 1.4571078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025514641483272565, + "kl": 0.04340451955795288, + "learning_rate": 6.133429996541518e-07, + "loss": 0.0004, + "num_tokens": 37683542.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5277611017227173, + "sampling/importance_sampling_ratio/mean": 0.9995805025100708, + "sampling/importance_sampling_ratio/min": 0.7091161608695984, + "sampling/sampling_logp_difference/max": 0.42380332946777344, + "sampling/sampling_logp_difference/mean": 0.015402511693537235, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 205.140625, + "completions/mean_terminated_length": 205.140625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.47519806027412415, + "epoch": 1.4583333333333333, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021531896020683963, + "kl": 0.04695788025856018, + "learning_rate": 6.1264905029621e-07, + "loss": 0.0004, + "num_tokens": 37718639.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.41250479221344, + "sampling/importance_sampling_ratio/mean": 1.000076413154602, + "sampling/importance_sampling_ratio/min": 0.6857278347015381, + "sampling/sampling_logp_difference/max": 0.3772745132446289, + "sampling/sampling_logp_difference/mean": 0.015646200627088547, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 192.3125, + "completions/mean_terminated_length": 192.3125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.43114298582077026, + "epoch": 1.4595588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1324860528711949, + "kl": 0.05779781565070152, + "learning_rate": 6.119548722678327e-07, + "loss": 0.075, + "num_tokens": 37752419.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4694956541061401, + "sampling/importance_sampling_ratio/mean": 0.9997105598449707, + "sampling/importance_sampling_ratio/min": 0.7481796741485596, + "sampling/sampling_logp_difference/max": 0.3849191665649414, + "sampling/sampling_logp_difference/mean": 0.014590677805244923, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 219.671875, + "completions/mean_terminated_length": 219.671875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.36435285210609436, + "epoch": 1.4607843137254901, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016175287540601237, + "kl": 0.033833034336566925, + "learning_rate": 6.112604669781572e-07, + "loss": 0.0003, + "num_tokens": 37783918.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3206326961517334, + "sampling/importance_sampling_ratio/mean": 0.9999261498451233, + "sampling/importance_sampling_ratio/min": 0.6813300251960754, + "sampling/sampling_logp_difference/max": 0.38370847702026367, + "sampling/sampling_logp_difference/mean": 0.012360158376395702, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 225.46875, + "completions/mean_terminated_length": 225.46875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.49749669432640076, + "epoch": 1.4620098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7798713022462089, + "kl": 0.04543078690767288, + "learning_rate": 6.105658358367822e-07, + "loss": -0.0268, + "num_tokens": 37817436.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3153043985366821, + "sampling/importance_sampling_ratio/mean": 1.0001451969146729, + "sampling/importance_sampling_ratio/min": 0.7269885540008545, + "sampling/sampling_logp_difference/max": 0.3188445568084717, + "sampling/sampling_logp_difference/mean": 0.01567385531961918, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 187.265625, + "completions/mean_terminated_length": 187.265625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3612132966518402, + "epoch": 1.4632352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02322135424877072, + "kl": 0.03864223137497902, + "learning_rate": 6.098709802537653e-07, + "loss": 0.0004, + "num_tokens": 37842733.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4756741523742676, + "sampling/importance_sampling_ratio/mean": 1.0002970695495605, + "sampling/importance_sampling_ratio/min": 0.6045997738838196, + "sampling/sampling_logp_difference/max": 0.5031886100769043, + "sampling/sampling_logp_difference/mean": 0.015087596140801907, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 215.828125, + "completions/mean_terminated_length": 215.828125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.41238224506378174, + "epoch": 1.4644607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018129618192082097, + "kl": 0.03457965701818466, + "learning_rate": 6.091759016396188e-07, + "loss": 0.0003, + "num_tokens": 37873906.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4744254350662231, + "sampling/importance_sampling_ratio/mean": 1.0000766515731812, + "sampling/importance_sampling_ratio/min": 0.7046488523483276, + "sampling/sampling_logp_difference/max": 0.3882683515548706, + "sampling/sampling_logp_difference/mean": 0.014175968244671822, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 171.96875, + "completions/mean_terminated_length": 171.96875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.44503602385520935, + "epoch": 1.465686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.841634051797968, + "kl": 0.05202612280845642, + "learning_rate": 6.084806014053086e-07, + "loss": 0.0064, + "num_tokens": 37900976.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4928193092346191, + "sampling/importance_sampling_ratio/mean": 0.9997214078903198, + "sampling/importance_sampling_ratio/min": 0.5696046352386475, + "sampling/sampling_logp_difference/max": 0.5628128051757812, + "sampling/sampling_logp_difference/mean": 0.015186592936515808, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 207.109375, + "completions/mean_terminated_length": 207.109375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.31114035844802856, + "epoch": 1.4669117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02005669378796282, + "kl": 0.033173758536577225, + "learning_rate": 6.077850809622498e-07, + "loss": 0.0003, + "num_tokens": 37931767.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2876096963882446, + "sampling/importance_sampling_ratio/mean": 0.9997863173484802, + "sampling/importance_sampling_ratio/min": 0.6910658478736877, + "sampling/sampling_logp_difference/max": 0.3695201873779297, + "sampling/sampling_logp_difference/mean": 0.011576864868402481, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 166.953125, + "completions/mean_terminated_length": 166.953125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5329830050468445, + "epoch": 1.468137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2504402916643644, + "kl": 0.09681343287229538, + "learning_rate": 6.070893417223052e-07, + "loss": -0.017, + "num_tokens": 37956324.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.568365216255188, + "sampling/importance_sampling_ratio/mean": 0.9999660849571228, + "sampling/importance_sampling_ratio/min": 0.6802662014961243, + "sampling/sampling_logp_difference/max": 0.45003390312194824, + "sampling/sampling_logp_difference/mean": 0.016995804384350777, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 606.0, + "completions/max_terminated_length": 606.0, + "completions/mean_length": 226.265625, + "completions/mean_terminated_length": 226.265625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.43442896008491516, + "epoch": 1.469362745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1722368189735048, + "kl": 0.04702545702457428, + "learning_rate": 6.06393385097781e-07, + "loss": -0.0327, + "num_tokens": 37990965.0, + "reward": 0.0625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.527762532234192, + "sampling/importance_sampling_ratio/mean": 1.0005501508712769, + "sampling/importance_sampling_ratio/min": 0.6057447791099548, + "sampling/sampling_logp_difference/max": 0.5012965202331543, + "sampling/sampling_logp_difference/mean": 0.013631860725581646, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 171.875, + "completions/mean_terminated_length": 171.875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3700704276561737, + "epoch": 1.4705882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0374565572533946, + "kl": 0.04746583104133606, + "learning_rate": 6.056972125014254e-07, + "loss": 0.0005, + "num_tokens": 38019325.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4623384475708008, + "sampling/importance_sampling_ratio/mean": 1.0001246929168701, + "sampling/importance_sampling_ratio/min": 0.48418575525283813, + "sampling/sampling_logp_difference/max": 0.7252867221832275, + "sampling/sampling_logp_difference/mean": 0.01417962834239006, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 229.296875, + "completions/mean_terminated_length": 229.296875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.47815942764282227, + "epoch": 1.471813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7273693398303055, + "kl": 0.047664448618888855, + "learning_rate": 6.050008253464246e-07, + "loss": -0.05, + "num_tokens": 38052688.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.627287745475769, + "sampling/importance_sampling_ratio/mean": 0.9992881417274475, + "sampling/importance_sampling_ratio/min": 0.6407450437545776, + "sampling/sampling_logp_difference/max": 0.48691463470458984, + "sampling/sampling_logp_difference/mean": 0.015799298882484436, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 196.28125, + "completions/mean_terminated_length": 196.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.5211617946624756, + "epoch": 1.4730392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023797699100773104, + "kl": 0.0537765696644783, + "learning_rate": 6.043042250464004e-07, + "loss": 0.0005, + "num_tokens": 38087762.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5172595977783203, + "sampling/importance_sampling_ratio/mean": 0.9997826814651489, + "sampling/importance_sampling_ratio/min": 0.731545090675354, + "sampling/sampling_logp_difference/max": 0.41690587997436523, + "sampling/sampling_logp_difference/mean": 0.016150303184986115, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 176.109375, + "completions/mean_terminated_length": 176.109375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.5632998943328857, + "epoch": 1.4742647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8665966289860008, + "kl": 0.12040524184703827, + "learning_rate": 6.036074130154071e-07, + "loss": 0.0063, + "num_tokens": 38115785.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4598504304885864, + "sampling/importance_sampling_ratio/mean": 1.000475287437439, + "sampling/importance_sampling_ratio/min": 0.684594452381134, + "sampling/sampling_logp_difference/max": 0.37892866134643555, + "sampling/sampling_logp_difference/mean": 0.01630369946360588, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 213.109375, + "completions/mean_terminated_length": 213.109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3936324119567871, + "epoch": 1.4754901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018966758991156338, + "kl": 0.03494948893785477, + "learning_rate": 6.029103906679293e-07, + "loss": 0.0003, + "num_tokens": 38146064.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2813870906829834, + "sampling/importance_sampling_ratio/mean": 1.0001485347747803, + "sampling/importance_sampling_ratio/min": 0.7350302338600159, + "sampling/sampling_logp_difference/max": 0.3078436851501465, + "sampling/sampling_logp_difference/mean": 0.0130241010338068, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 190.40625, + "completions/mean_terminated_length": 190.40625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.48241475224494934, + "epoch": 1.4767156862745099, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02191195895971036, + "kl": 0.042058248072862625, + "learning_rate": 6.022131594188777e-07, + "loss": 0.0004, + "num_tokens": 38182682.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5201222896575928, + "sampling/importance_sampling_ratio/mean": 0.9998643398284912, + "sampling/importance_sampling_ratio/min": 0.7046812176704407, + "sampling/sampling_logp_difference/max": 0.4187908172607422, + "sampling/sampling_logp_difference/mean": 0.015262596309185028, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 188.109375, + "completions/mean_terminated_length": 188.109375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.5057660937309265, + "epoch": 1.4779411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027054224214642, + "kl": 0.056050047278404236, + "learning_rate": 6.01515720683588e-07, + "loss": 0.0005, + "num_tokens": 38208337.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.605932593345642, + "sampling/importance_sampling_ratio/mean": 0.9995064735412598, + "sampling/importance_sampling_ratio/min": 0.6954973936080933, + "sampling/sampling_logp_difference/max": 0.47370457649230957, + "sampling/sampling_logp_difference/mean": 0.017247628420591354, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 181.390625, + "completions/mean_terminated_length": 181.390625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.4926803708076477, + "epoch": 1.4791666666666667, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7028311180043693, + "kl": 0.06302222609519958, + "learning_rate": 6.008180758778166e-07, + "loss": -0.0068, + "num_tokens": 38237578.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.2768861055374146, + "sampling/importance_sampling_ratio/mean": 0.9997392892837524, + "sampling/importance_sampling_ratio/min": 0.6850432753562927, + "sampling/sampling_logp_difference/max": 0.37827324867248535, + "sampling/sampling_logp_difference/mean": 0.016209810972213745, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 196.796875, + "completions/mean_terminated_length": 196.796875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.5411112904548645, + "epoch": 1.4803921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02817009766619767, + "kl": 0.060022253543138504, + "learning_rate": 6.001202264177382e-07, + "loss": 0.0006, + "num_tokens": 38269341.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6009783744812012, + "sampling/importance_sampling_ratio/mean": 0.9997267127037048, + "sampling/importance_sampling_ratio/min": 0.650458037853241, + "sampling/sampling_logp_difference/max": 0.4706149101257324, + "sampling/sampling_logp_difference/mean": 0.016653675585985184, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 217.90625, + "completions/mean_terminated_length": 217.90625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.45477426052093506, + "epoch": 1.4816176470588236, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02109452365953674, + "kl": 0.038713470101356506, + "learning_rate": 5.99422173719943e-07, + "loss": 0.0004, + "num_tokens": 38300023.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8954544067382812, + "sampling/importance_sampling_ratio/mean": 0.9999589920043945, + "sampling/importance_sampling_ratio/min": 0.6483321189880371, + "sampling/sampling_logp_difference/max": 0.6394586563110352, + "sampling/sampling_logp_difference/mean": 0.01450974028557539, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 185.15625, + "completions/mean_terminated_length": 185.15625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5922730565071106, + "epoch": 1.482843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027701588601640254, + "kl": 0.07016173005104065, + "learning_rate": 5.987239192014335e-07, + "loss": 0.0007, + "num_tokens": 38333857.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.542392611503601, + "sampling/importance_sampling_ratio/mean": 1.0000760555267334, + "sampling/importance_sampling_ratio/min": 0.5946744084358215, + "sampling/sampling_logp_difference/max": 0.5197412967681885, + "sampling/sampling_logp_difference/mean": 0.017837852239608765, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 185.6875, + "completions/mean_terminated_length": 185.6875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.6171554327011108, + "epoch": 1.4840686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.91756634392374, + "kl": 0.10552050173282623, + "learning_rate": 5.980254642796226e-07, + "loss": 0.0181, + "num_tokens": 38362701.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.287684679031372, + "sampling/importance_sampling_ratio/mean": 0.9996305704116821, + "sampling/importance_sampling_ratio/min": 0.6957054138183594, + "sampling/sampling_logp_difference/max": 0.36282896995544434, + "sampling/sampling_logp_difference/mean": 0.017641430720686913, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 169.1875, + "completions/mean_terminated_length": 169.1875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4842797815799713, + "epoch": 1.4852941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.828278066243578, + "kl": 0.059575796127319336, + "learning_rate": 5.973268103723293e-07, + "loss": -0.017, + "num_tokens": 38388553.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3947969675064087, + "sampling/importance_sampling_ratio/mean": 1.000115156173706, + "sampling/importance_sampling_ratio/min": 0.6172971725463867, + "sampling/sampling_logp_difference/max": 0.4824047088623047, + "sampling/sampling_logp_difference/mean": 0.015291726216673851, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 243.53125, + "completions/mean_terminated_length": 243.53125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5568035244941711, + "epoch": 1.4865196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9233342733029263, + "kl": 0.05577544867992401, + "learning_rate": 5.966279588977766e-07, + "loss": 0.001, + "num_tokens": 38424603.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.2761130332946777, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.7563543319702148, + "sampling/sampling_logp_difference/max": 0.27924537658691406, + "sampling/sampling_logp_difference/mean": 0.015990786254405975, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 198.078125, + "completions/mean_terminated_length": 198.078125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4018840193748474, + "epoch": 1.4877450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8715222833589117, + "kl": 0.0444413498044014, + "learning_rate": 5.959289112745891e-07, + "loss": -0.0033, + "num_tokens": 38455408.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5815471410751343, + "sampling/importance_sampling_ratio/mean": 1.0003597736358643, + "sampling/importance_sampling_ratio/min": 0.6324144005775452, + "sampling/sampling_logp_difference/max": 0.4584035873413086, + "sampling/sampling_logp_difference/mean": 0.013145819306373596, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 189.5625, + "completions/mean_terminated_length": 189.5625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4872584939002991, + "epoch": 1.4889705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6873065312042486, + "kl": 0.07594792544841766, + "learning_rate": 5.952296689217889e-07, + "loss": -0.0159, + "num_tokens": 38483396.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.55042564868927, + "sampling/importance_sampling_ratio/mean": 1.0008623600006104, + "sampling/importance_sampling_ratio/min": 0.6311050057411194, + "sampling/sampling_logp_difference/max": 0.4602830410003662, + "sampling/sampling_logp_difference/mean": 0.01651753857731819, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 172.921875, + "completions/mean_terminated_length": 172.921875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.39633241295814514, + "epoch": 1.4901960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02532607640777845, + "kl": 0.04810824990272522, + "learning_rate": 5.945302332587938e-07, + "loss": 0.0004, + "num_tokens": 38511055.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4757611751556396, + "sampling/importance_sampling_ratio/mean": 0.999756395816803, + "sampling/importance_sampling_ratio/min": 0.6455199718475342, + "sampling/sampling_logp_difference/max": 0.43769919872283936, + "sampling/sampling_logp_difference/mean": 0.014180956408381462, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 228.609375, + "completions/mean_terminated_length": 228.609375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.450186163187027, + "epoch": 1.491421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8566550667556685, + "kl": 0.05234798416495323, + "learning_rate": 5.938306057054138e-07, + "loss": 0.0115, + "num_tokens": 38542038.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.2922401428222656, + "sampling/importance_sampling_ratio/mean": 0.9993905425071716, + "sampling/importance_sampling_ratio/min": 0.7026405930519104, + "sampling/sampling_logp_difference/max": 0.35290980339050293, + "sampling/sampling_logp_difference/mean": 0.013949232175946236, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 244.828125, + "completions/mean_terminated_length": 244.828125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.37430262565612793, + "epoch": 1.4926470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0171551854812675, + "kl": 0.03328785300254822, + "learning_rate": 5.931307876818487e-07, + "loss": 0.0003, + "num_tokens": 38576619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5551716089248657, + "sampling/importance_sampling_ratio/mean": 0.9995331764221191, + "sampling/importance_sampling_ratio/min": 0.6900882124900818, + "sampling/sampling_logp_difference/max": 0.441585898399353, + "sampling/sampling_logp_difference/mean": 0.012714147567749023, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 182.484375, + "completions/mean_terminated_length": 182.484375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.542378306388855, + "epoch": 1.4938725490196079, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5182456389303929, + "kl": 0.07905251532793045, + "learning_rate": 5.924307806086843e-07, + "loss": 0.054, + "num_tokens": 38604570.0, + "reward": 0.5, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5722545385360718, + "sampling/importance_sampling_ratio/mean": 1.0004451274871826, + "sampling/importance_sampling_ratio/min": 0.6802780628204346, + "sampling/sampling_logp_difference/max": 0.4525105953216553, + "sampling/sampling_logp_difference/mean": 0.01747533492743969, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 182.28125, + "completions/mean_terminated_length": 182.28125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4274052381515503, + "epoch": 1.4950980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.066348521021786, + "kl": 0.04405607283115387, + "learning_rate": 5.917305859068911e-07, + "loss": 0.0129, + "num_tokens": 38633372.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.47055184841156, + "sampling/importance_sampling_ratio/mean": 0.999843955039978, + "sampling/importance_sampling_ratio/min": 0.7588876485824585, + "sampling/sampling_logp_difference/max": 0.3856377601623535, + "sampling/sampling_logp_difference/mean": 0.014215231873095036, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 218.109375, + "completions/mean_terminated_length": 218.109375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3771829903125763, + "epoch": 1.4963235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7445440441300208, + "kl": 0.040690891444683075, + "learning_rate": 5.910302049978199e-07, + "loss": 0.0228, + "num_tokens": 38664627.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4985706806182861, + "sampling/importance_sampling_ratio/mean": 0.9998940825462341, + "sampling/importance_sampling_ratio/min": 0.6546953320503235, + "sampling/sampling_logp_difference/max": 0.42358529567718506, + "sampling/sampling_logp_difference/mean": 0.012551363557577133, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 158.46875, + "completions/mean_terminated_length": 158.46875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.5464028120040894, + "epoch": 1.4975490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9294205998653842, + "kl": 0.07251539826393127, + "learning_rate": 5.903296393031995e-07, + "loss": 0.0013, + "num_tokens": 38692945.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2881121635437012, + "sampling/importance_sampling_ratio/mean": 1.0003340244293213, + "sampling/importance_sampling_ratio/min": 0.626869797706604, + "sampling/sampling_logp_difference/max": 0.46701645851135254, + "sampling/sampling_logp_difference/mean": 0.017922550439834595, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 203.46875, + "completions/mean_terminated_length": 203.46875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.45503854751586914, + "epoch": 1.4987745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0190737118625132, + "kl": 0.03507302328944206, + "learning_rate": 5.896288902451338e-07, + "loss": 0.0003, + "num_tokens": 38728687.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.930330753326416, + "sampling/importance_sampling_ratio/mean": 0.9999499917030334, + "sampling/importance_sampling_ratio/min": 0.6622678637504578, + "sampling/sampling_logp_difference/max": 0.6576913595199585, + "sampling/sampling_logp_difference/mean": 0.01384247001260519, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 174.015625, + "completions/mean_terminated_length": 174.015625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3848019242286682, + "epoch": 1.5, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8979730657892885, + "kl": 0.05828709900379181, + "learning_rate": 5.88927959246099e-07, + "loss": 0.0029, + "num_tokens": 38755856.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4173165559768677, + "sampling/importance_sampling_ratio/mean": 0.999789297580719, + "sampling/importance_sampling_ratio/min": 0.36534881591796875, + "sampling/sampling_logp_difference/max": 1.0069026947021484, + "sampling/sampling_logp_difference/mean": 0.014410626143217087, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 179.328125, + "completions/mean_terminated_length": 179.328125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.5048831105232239, + "epoch": 1.5012254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9254894215425021, + "kl": 0.06378062814474106, + "learning_rate": 5.882268477289408e-07, + "loss": 0.029, + "num_tokens": 38788613.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6207302808761597, + "sampling/importance_sampling_ratio/mean": 1.0001567602157593, + "sampling/importance_sampling_ratio/min": 0.6898323893547058, + "sampling/sampling_logp_difference/max": 0.4828767776489258, + "sampling/sampling_logp_difference/mean": 0.015225782990455627, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 196.625, + "completions/mean_terminated_length": 196.625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.47566431760787964, + "epoch": 1.5024509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02140793104362899, + "kl": 0.035288769751787186, + "learning_rate": 5.875255571168709e-07, + "loss": 0.0004, + "num_tokens": 38817165.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4393155574798584, + "sampling/importance_sampling_ratio/mean": 1.0006489753723145, + "sampling/importance_sampling_ratio/min": 0.6677610874176025, + "sampling/sampling_logp_difference/max": 0.4038248062133789, + "sampling/sampling_logp_difference/mean": 0.016018467023968697, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 172.203125, + "completions/mean_terminated_length": 172.203125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.42229175567626953, + "epoch": 1.5036764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023800846567112507, + "kl": 0.048726920038461685, + "learning_rate": 5.868240888334652e-07, + "loss": 0.0005, + "num_tokens": 38844458.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.283900499343872, + "sampling/importance_sampling_ratio/mean": 0.9996721744537354, + "sampling/importance_sampling_ratio/min": 0.6884435415267944, + "sampling/sampling_logp_difference/max": 0.3733220100402832, + "sampling/sampling_logp_difference/mean": 0.01524503342807293, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 220.796875, + "completions/mean_terminated_length": 220.796875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.5032088160514832, + "epoch": 1.5049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021647534745950717, + "kl": 0.05585585534572601, + "learning_rate": 5.861224443026595e-07, + "loss": 0.0005, + "num_tokens": 38878813.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3067570924758911, + "sampling/importance_sampling_ratio/mean": 0.9995409250259399, + "sampling/importance_sampling_ratio/min": 0.703955352306366, + "sampling/sampling_logp_difference/max": 0.3510403633117676, + "sampling/sampling_logp_difference/mean": 0.015978462994098663, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 183.90625, + "completions/mean_terminated_length": 183.90625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4661997854709625, + "epoch": 1.5061274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7744045584456632, + "kl": 0.06239331141114235, + "learning_rate": 5.854206249487478e-07, + "loss": -0.0115, + "num_tokens": 38905527.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5400595664978027, + "sampling/importance_sampling_ratio/mean": 1.0004256963729858, + "sampling/importance_sampling_ratio/min": 0.6938456892967224, + "sampling/sampling_logp_difference/max": 0.4318211078643799, + "sampling/sampling_logp_difference/mean": 0.015041721984744072, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 197.390625, + "completions/mean_terminated_length": 197.390625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3444875180721283, + "epoch": 1.5073529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02053351113351947, + "kl": 0.036030255258083344, + "learning_rate": 5.847186321963792e-07, + "loss": 0.0003, + "num_tokens": 38936752.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2809301614761353, + "sampling/importance_sampling_ratio/mean": 0.9998122453689575, + "sampling/importance_sampling_ratio/min": 0.6104072332382202, + "sampling/sampling_logp_difference/max": 0.49362897872924805, + "sampling/sampling_logp_difference/mean": 0.012584058567881584, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 196.203125, + "completions/mean_terminated_length": 196.203125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4444473087787628, + "epoch": 1.508578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025489591186659628, + "kl": 0.035917483270168304, + "learning_rate": 5.840164674705542e-07, + "loss": 0.0003, + "num_tokens": 38967997.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4047203063964844, + "sampling/importance_sampling_ratio/mean": 0.999308705329895, + "sampling/importance_sampling_ratio/min": 0.5704594850540161, + "sampling/sampling_logp_difference/max": 0.5613131523132324, + "sampling/sampling_logp_difference/mean": 0.01690061390399933, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 185.21875, + "completions/mean_terminated_length": 185.21875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.39085036516189575, + "epoch": 1.5098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02106353212827986, + "kl": 0.03460868448019028, + "learning_rate": 5.833141321966228e-07, + "loss": 0.0003, + "num_tokens": 39000619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5278422832489014, + "sampling/importance_sampling_ratio/mean": 0.9994332790374756, + "sampling/importance_sampling_ratio/min": 0.6895950436592102, + "sampling/sampling_logp_difference/max": 0.4238564968109131, + "sampling/sampling_logp_difference/mean": 0.014932571910321712, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 164.390625, + "completions/mean_terminated_length": 164.390625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3618762493133545, + "epoch": 1.5110294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020524870792948353, + "kl": 0.03939305990934372, + "learning_rate": 5.826116278002813e-07, + "loss": 0.0004, + "num_tokens": 39025684.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.434867262840271, + "sampling/importance_sampling_ratio/mean": 0.9991062879562378, + "sampling/importance_sampling_ratio/min": 0.6994453072547913, + "sampling/sampling_logp_difference/max": 0.361072301864624, + "sampling/sampling_logp_difference/mean": 0.014167373068630695, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 211.9375, + "completions/mean_terminated_length": 211.9375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4231787621974945, + "epoch": 1.5122549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7374625059211084, + "kl": 0.04008599370718002, + "learning_rate": 5.819089557075688e-07, + "loss": -0.0103, + "num_tokens": 39057216.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5467814207077026, + "sampling/importance_sampling_ratio/mean": 1.0004159212112427, + "sampling/importance_sampling_ratio/min": 0.6743151545524597, + "sampling/sampling_logp_difference/max": 0.4361763000488281, + "sampling/sampling_logp_difference/mean": 0.015356146730482578, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 197.40625, + "completions/mean_terminated_length": 197.40625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.39275944232940674, + "epoch": 1.5134803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7440143713536784, + "kl": 0.054746758192777634, + "learning_rate": 5.812061173448654e-07, + "loss": -0.0005, + "num_tokens": 39090490.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8623653650283813, + "sampling/importance_sampling_ratio/mean": 0.9998970627784729, + "sampling/importance_sampling_ratio/min": 0.6154475808143616, + "sampling/sampling_logp_difference/max": 0.62184739112854, + "sampling/sampling_logp_difference/mean": 0.015792280435562134, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 204.71875, + "completions/mean_terminated_length": 204.71875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.40649598836898804, + "epoch": 1.5147058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9315226544513092, + "kl": 0.03523103892803192, + "learning_rate": 5.805031141388883e-07, + "loss": 0.0305, + "num_tokens": 39124760.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.438776969909668, + "sampling/importance_sampling_ratio/mean": 1.0000498294830322, + "sampling/importance_sampling_ratio/min": 0.6908988356590271, + "sampling/sampling_logp_difference/max": 0.36976194381713867, + "sampling/sampling_logp_difference/mean": 0.014396263286471367, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 193.09375, + "completions/mean_terminated_length": 193.09375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3641846776008606, + "epoch": 1.5159313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7727836115847848, + "kl": 0.04704434424638748, + "learning_rate": 5.797999475166896e-07, + "loss": -0.0046, + "num_tokens": 39168782.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5360101461410522, + "sampling/importance_sampling_ratio/mean": 0.9998215436935425, + "sampling/importance_sampling_ratio/min": 0.7178670763969421, + "sampling/sampling_logp_difference/max": 0.42918825149536133, + "sampling/sampling_logp_difference/mean": 0.012815626338124275, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 194.203125, + "completions/mean_terminated_length": 194.203125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4459923505783081, + "epoch": 1.517156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7709668361144417, + "kl": 0.05876849964261055, + "learning_rate": 5.790966189056529e-07, + "loss": -0.0015, + "num_tokens": 39200987.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5467361211776733, + "sampling/importance_sampling_ratio/mean": 0.9998927116394043, + "sampling/importance_sampling_ratio/min": 0.6045164465904236, + "sampling/sampling_logp_difference/max": 0.503326416015625, + "sampling/sampling_logp_difference/mean": 0.01612141728401184, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 213.5625, + "completions/mean_terminated_length": 213.5625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.468833863735199, + "epoch": 1.5183823529411766, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3745813326523788, + "kl": 0.043368056416511536, + "learning_rate": 5.783931297334907e-07, + "loss": 0.0629, + "num_tokens": 39237087.0, + "reward": 0.8125, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4521344900131226, + "sampling/importance_sampling_ratio/mean": 0.9998012185096741, + "sampling/importance_sampling_ratio/min": 0.6323934197425842, + "sampling/sampling_logp_difference/max": 0.45824360847473145, + "sampling/sampling_logp_difference/mean": 0.015378328040242195, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 215.53125, + "completions/mean_terminated_length": 215.53125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.47365593910217285, + "epoch": 1.5196078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02133304870081858, + "kl": 0.055505771189928055, + "learning_rate": 5.776894814282415e-07, + "loss": 0.0004, + "num_tokens": 39268145.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4176025390625, + "sampling/importance_sampling_ratio/mean": 0.9997040033340454, + "sampling/importance_sampling_ratio/min": 0.689681887626648, + "sampling/sampling_logp_difference/max": 0.3715248107910156, + "sampling/sampling_logp_difference/mean": 0.016383890062570572, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 177.15625, + "completions/mean_terminated_length": 177.15625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.44920074939727783, + "epoch": 1.5208333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02596466359665803, + "kl": 0.036328621208667755, + "learning_rate": 5.769856754182667e-07, + "loss": 0.0004, + "num_tokens": 39298299.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4447506666183472, + "sampling/importance_sampling_ratio/mean": 0.9996228218078613, + "sampling/importance_sampling_ratio/min": 0.6075159907341003, + "sampling/sampling_logp_difference/max": 0.49837684631347656, + "sampling/sampling_logp_difference/mean": 0.016213443130254745, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 177.40625, + "completions/mean_terminated_length": 177.40625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.406511127948761, + "epoch": 1.5220588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.935093556595975, + "kl": 0.03980601578950882, + "learning_rate": 5.762817131322481e-07, + "loss": 0.0056, + "num_tokens": 39326789.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.405997633934021, + "sampling/importance_sampling_ratio/mean": 1.0002541542053223, + "sampling/importance_sampling_ratio/min": 0.6896554231643677, + "sampling/sampling_logp_difference/max": 0.371563196182251, + "sampling/sampling_logp_difference/mean": 0.016482625156641006, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 181.84375, + "completions/mean_terminated_length": 181.84375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.41599082946777344, + "epoch": 1.5232843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7021961826808495, + "kl": 0.04142145812511444, + "learning_rate": 5.755775959991844e-07, + "loss": -0.0041, + "num_tokens": 39356171.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.2728204727172852, + "sampling/importance_sampling_ratio/mean": 0.9999167323112488, + "sampling/importance_sampling_ratio/min": 0.6699854135513306, + "sampling/sampling_logp_difference/max": 0.4004993438720703, + "sampling/sampling_logp_difference/mean": 0.014757356606423855, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 142.78125, + "completions/mean_terminated_length": 142.78125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4153823256492615, + "epoch": 1.5245098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03757600758983537, + "kl": 0.0534350611269474, + "learning_rate": 5.74873325448389e-07, + "loss": 0.0005, + "num_tokens": 39380349.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5157074928283691, + "sampling/importance_sampling_ratio/mean": 1.0008015632629395, + "sampling/importance_sampling_ratio/min": 0.6309089660644531, + "sampling/sampling_logp_difference/max": 0.46059370040893555, + "sampling/sampling_logp_difference/mean": 0.01728636398911476, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 162.4375, + "completions/mean_terminated_length": 162.4375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.38047099113464355, + "epoch": 1.5257352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027273282927566717, + "kl": 0.03636075556278229, + "learning_rate": 5.741689029094861e-07, + "loss": 0.0003, + "num_tokens": 39407177.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.600734829902649, + "sampling/importance_sampling_ratio/mean": 1.000301480293274, + "sampling/importance_sampling_ratio/min": 0.6788877248764038, + "sampling/sampling_logp_difference/max": 0.4704627990722656, + "sampling/sampling_logp_difference/mean": 0.014349868521094322, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 185.3125, + "completions/mean_terminated_length": 185.3125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3892589211463928, + "epoch": 1.5269607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9945046565892149, + "kl": 0.05229416489601135, + "learning_rate": 5.73464329812409e-07, + "loss": -0.0324, + "num_tokens": 39433757.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.7655868530273438, + "sampling/importance_sampling_ratio/mean": 1.0002690553665161, + "sampling/importance_sampling_ratio/min": 0.7580675482749939, + "sampling/sampling_logp_difference/max": 0.5684831142425537, + "sampling/sampling_logp_difference/mean": 0.015093202702701092, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3466736078262329, + "epoch": 1.528186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8782710940419933, + "kl": 0.04014609381556511, + "learning_rate": 5.727596075873965e-07, + "loss": 0.0098, + "num_tokens": 39458949.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4058247804641724, + "sampling/importance_sampling_ratio/mean": 1.0001182556152344, + "sampling/importance_sampling_ratio/min": 0.6978297829627991, + "sampling/sampling_logp_difference/max": 0.35978007316589355, + "sampling/sampling_logp_difference/mean": 0.013260153122246265, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 148.21875, + "completions/mean_terminated_length": 148.21875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.41673874855041504, + "epoch": 1.5294117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.115362662571301, + "kl": 0.07008904218673706, + "learning_rate": 5.7205473766499e-07, + "loss": -0.0083, + "num_tokens": 39488035.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.582716941833496, + "sampling/importance_sampling_ratio/mean": 1.0004963874816895, + "sampling/importance_sampling_ratio/min": 0.4275853931903839, + "sampling/sampling_logp_difference/max": 0.8496012687683105, + "sampling/sampling_logp_difference/mean": 0.017482828348875046, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 157.703125, + "completions/mean_terminated_length": 157.703125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.42711418867111206, + "epoch": 1.530637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8976514209124797, + "kl": 0.06900682300329208, + "learning_rate": 5.71349721476031e-07, + "loss": 0.0061, + "num_tokens": 39518224.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6055638790130615, + "sampling/importance_sampling_ratio/mean": 1.0001951456069946, + "sampling/importance_sampling_ratio/min": 0.6225906610488892, + "sampling/sampling_logp_difference/max": 0.47386598587036133, + "sampling/sampling_logp_difference/mean": 0.015635859221220016, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 223.875, + "completions/mean_terminated_length": 223.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4811854362487793, + "epoch": 1.531862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023626790688363004, + "kl": 0.04002731293439865, + "learning_rate": 5.706445604516574e-07, + "loss": 0.0004, + "num_tokens": 39561032.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3421891927719116, + "sampling/importance_sampling_ratio/mean": 0.9997655153274536, + "sampling/importance_sampling_ratio/min": 0.6639184355735779, + "sampling/sampling_logp_difference/max": 0.40959596633911133, + "sampling/sampling_logp_difference/mean": 0.01641765981912613, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 164.359375, + "completions/mean_terminated_length": 164.359375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.4115786552429199, + "epoch": 1.5330882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9757896386048582, + "kl": 0.08084049820899963, + "learning_rate": 5.699392560233017e-07, + "loss": 0.0328, + "num_tokens": 39588863.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999293088912964, + "sampling/importance_sampling_ratio/min": 0.5453224778175354, + "sampling/sampling_logp_difference/max": 0.8030078411102295, + "sampling/sampling_logp_difference/mean": 0.01555370632559061, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 168.9375, + "completions/mean_terminated_length": 168.9375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3465157747268677, + "epoch": 1.534313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02178792693278117, + "kl": 0.03237396106123924, + "learning_rate": 5.69233809622687e-07, + "loss": 0.0003, + "num_tokens": 39617035.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.524991750717163, + "sampling/importance_sampling_ratio/mean": 1.0002695322036743, + "sampling/importance_sampling_ratio/min": 0.714755654335022, + "sampling/sampling_logp_difference/max": 0.42198896408081055, + "sampling/sampling_logp_difference/mean": 0.013200388289988041, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 173.296875, + "completions/mean_terminated_length": 173.296875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3633451461791992, + "epoch": 1.5355392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020463538693140625, + "kl": 0.035572197288274765, + "learning_rate": 5.685282226818249e-07, + "loss": 0.0003, + "num_tokens": 39649118.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4389134645462036, + "sampling/importance_sampling_ratio/mean": 0.9997230768203735, + "sampling/importance_sampling_ratio/min": 0.6744645237922668, + "sampling/sampling_logp_difference/max": 0.39383625984191895, + "sampling/sampling_logp_difference/mean": 0.014470847323536873, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 161.484375, + "completions/mean_terminated_length": 161.484375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.47554466128349304, + "epoch": 1.5367647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.254555955122257, + "kl": 0.03972441703081131, + "learning_rate": 5.678224966330119e-07, + "loss": -0.0012, + "num_tokens": 39680525.0, + "reward": 0.3125, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 0.9994322657585144, + "sampling/importance_sampling_ratio/min": 0.7094095349311829, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.015320624224841595, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 158.3125, + "completions/mean_terminated_length": 158.3125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.40946847200393677, + "epoch": 1.5379901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02762295709941761, + "kl": 0.047129470854997635, + "learning_rate": 5.671166329088277e-07, + "loss": 0.0004, + "num_tokens": 39706497.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.292691946029663, + "sampling/importance_sampling_ratio/mean": 0.9992033243179321, + "sampling/importance_sampling_ratio/min": 0.69576495885849, + "sampling/sampling_logp_difference/max": 0.3627433776855469, + "sampling/sampling_logp_difference/mean": 0.01570296101272106, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 174.609375, + "completions/mean_terminated_length": 174.609375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.35514920949935913, + "epoch": 1.5392156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01934494958303013, + "kl": 0.029100321233272552, + "learning_rate": 5.664106329421305e-07, + "loss": 0.0003, + "num_tokens": 39735928.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4580605030059814, + "sampling/importance_sampling_ratio/mean": 1.0000929832458496, + "sampling/importance_sampling_ratio/min": 0.6222665905952454, + "sampling/sampling_logp_difference/max": 0.47438669204711914, + "sampling/sampling_logp_difference/mean": 0.012476583942770958, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 195.078125, + "completions/mean_terminated_length": 195.078125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.43784743547439575, + "epoch": 1.5404411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0638600413008341, + "kl": 0.04495566338300705, + "learning_rate": 5.657044981660559e-07, + "loss": 0.0113, + "num_tokens": 39768541.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5521928071975708, + "sampling/importance_sampling_ratio/mean": 0.9999226331710815, + "sampling/importance_sampling_ratio/min": 0.6645677089691162, + "sampling/sampling_logp_difference/max": 0.4396686553955078, + "sampling/sampling_logp_difference/mean": 0.015002140775322914, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 156.09375, + "completions/mean_terminated_length": 156.09375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.42065680027008057, + "epoch": 1.5416666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02766408901112148, + "kl": 0.04885207861661911, + "learning_rate": 5.649982300140123e-07, + "loss": 0.0005, + "num_tokens": 39795427.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4181545972824097, + "sampling/importance_sampling_ratio/mean": 0.999775767326355, + "sampling/importance_sampling_ratio/min": 0.7194200754165649, + "sampling/sampling_logp_difference/max": 0.34935641288757324, + "sampling/sampling_logp_difference/mean": 0.016253039240837097, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 149.328125, + "completions/mean_terminated_length": 149.328125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3208079934120178, + "epoch": 1.5428921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038933861003191204, + "kl": 0.04223839193582535, + "learning_rate": 5.642918299196796e-07, + "loss": 0.0004, + "num_tokens": 39819848.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3146650791168213, + "sampling/importance_sampling_ratio/mean": 0.9999975562095642, + "sampling/importance_sampling_ratio/min": 0.7336270809173584, + "sampling/sampling_logp_difference/max": 0.30975449085235596, + "sampling/sampling_logp_difference/mean": 0.01251753605902195, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 146.109375, + "completions/mean_terminated_length": 146.109375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3680001199245453, + "epoch": 1.5441176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026802418251748644, + "kl": 0.04381459206342697, + "learning_rate": 5.635852993170052e-07, + "loss": 0.0004, + "num_tokens": 39842927.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5502644777297974, + "sampling/importance_sampling_ratio/mean": 1.000135898590088, + "sampling/importance_sampling_ratio/min": 0.6775410175323486, + "sampling/sampling_logp_difference/max": 0.43842554092407227, + "sampling/sampling_logp_difference/mean": 0.015402581542730331, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 190.6875, + "completions/mean_terminated_length": 190.6875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.39454811811447144, + "epoch": 1.545343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02253259241212441, + "kl": 0.036204319447278976, + "learning_rate": 5.628786396402013e-07, + "loss": 0.0004, + "num_tokens": 39873771.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6648271083831787, + "sampling/importance_sampling_ratio/mean": 1.0001277923583984, + "sampling/importance_sampling_ratio/min": 0.6251299977302551, + "sampling/sampling_logp_difference/max": 0.5097212791442871, + "sampling/sampling_logp_difference/mean": 0.014238608069717884, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 139.796875, + "completions/mean_terminated_length": 139.796875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.31300443410873413, + "epoch": 1.5465686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02311308752613259, + "kl": 0.03415478765964508, + "learning_rate": 5.621718523237426e-07, + "loss": 0.0003, + "num_tokens": 39898766.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3027833700180054, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.6423305869102478, + "sampling/sampling_logp_difference/max": 0.44265222549438477, + "sampling/sampling_logp_difference/mean": 0.012356936000287533, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 197.3125, + "completions/mean_terminated_length": 197.3125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3989219069480896, + "epoch": 1.5477941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8978740474774581, + "kl": 0.05985507369041443, + "learning_rate": 5.614649388023622e-07, + "loss": 0.0225, + "num_tokens": 39928338.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.3391810655593872, + "sampling/importance_sampling_ratio/mean": 0.9998857975006104, + "sampling/importance_sampling_ratio/min": 0.7125557065010071, + "sampling/sampling_logp_difference/max": 0.3388972282409668, + "sampling/sampling_logp_difference/mean": 0.014779426157474518, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 183.640625, + "completions/mean_terminated_length": 183.640625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.30833882093429565, + "epoch": 1.5490196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019367216326750366, + "kl": 0.02864927239716053, + "learning_rate": 5.607579005110502e-07, + "loss": 0.0003, + "num_tokens": 39955419.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5163096189498901, + "sampling/importance_sampling_ratio/mean": 0.9996656179428101, + "sampling/importance_sampling_ratio/min": 0.6625163555145264, + "sampling/sampling_logp_difference/max": 0.41627955436706543, + "sampling/sampling_logp_difference/mean": 0.012281282804906368, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 180.03125, + "completions/mean_terminated_length": 180.03125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3912290930747986, + "epoch": 1.5502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8134361518544091, + "kl": 0.0405205562710762, + "learning_rate": 5.60050738885049e-07, + "loss": -0.0021, + "num_tokens": 39984301.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.362772822380066, + "sampling/importance_sampling_ratio/mean": 0.9998492002487183, + "sampling/importance_sampling_ratio/min": 0.744883120059967, + "sampling/sampling_logp_difference/max": 0.3095214366912842, + "sampling/sampling_logp_difference/mean": 0.015420593321323395, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 212.90625, + "completions/mean_terminated_length": 212.90625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4786534011363983, + "epoch": 1.5514705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020424667846182472, + "kl": 0.035735562443733215, + "learning_rate": 5.593434553598525e-07, + "loss": 0.0004, + "num_tokens": 40017591.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.328540325164795, + "sampling/importance_sampling_ratio/mean": 0.9997981786727905, + "sampling/importance_sampling_ratio/min": 0.6714566349983215, + "sampling/sampling_logp_difference/max": 0.39830589294433594, + "sampling/sampling_logp_difference/mean": 0.017017975449562073, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 190.828125, + "completions/mean_terminated_length": 190.828125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.448546826839447, + "epoch": 1.5526960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.789071005407192, + "kl": 0.03808409720659256, + "learning_rate": 5.586360513712009e-07, + "loss": 0.0002, + "num_tokens": 40045132.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.361933946609497, + "sampling/importance_sampling_ratio/mean": 1.0002695322036743, + "sampling/importance_sampling_ratio/min": 0.617332935333252, + "sampling/sampling_logp_difference/max": 0.482346773147583, + "sampling/sampling_logp_difference/mean": 0.016598699614405632, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 193.140625, + "completions/mean_terminated_length": 193.140625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.30193161964416504, + "epoch": 1.553921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016319502379357587, + "kl": 0.024532688781619072, + "learning_rate": 5.579285283550797e-07, + "loss": 0.0002, + "num_tokens": 40077301.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4356786012649536, + "sampling/importance_sampling_ratio/mean": 0.9999880194664001, + "sampling/importance_sampling_ratio/min": 0.6256784200668335, + "sampling/sampling_logp_difference/max": 0.4689188003540039, + "sampling/sampling_logp_difference/mean": 0.011309798806905746, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 209.421875, + "completions/mean_terminated_length": 209.421875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3677401840686798, + "epoch": 1.5551470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015574414670255563, + "kl": 0.03165864199399948, + "learning_rate": 5.572208877477159e-07, + "loss": 0.0003, + "num_tokens": 40110480.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4888417720794678, + "sampling/importance_sampling_ratio/mean": 0.9999098777770996, + "sampling/importance_sampling_ratio/min": 0.4572525918483734, + "sampling/sampling_logp_difference/max": 0.7825193405151367, + "sampling/sampling_logp_difference/mean": 0.014508018270134926, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 149.328125, + "completions/mean_terminated_length": 149.328125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.37257325649261475, + "epoch": 1.5563725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022973999136349448, + "kl": 0.04484736546874046, + "learning_rate": 5.565131309855752e-07, + "loss": 0.0004, + "num_tokens": 40138037.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.54764986038208, + "sampling/importance_sampling_ratio/mean": 1.0002665519714355, + "sampling/importance_sampling_ratio/min": 0.6457412242889404, + "sampling/sampling_logp_difference/max": 0.43735647201538086, + "sampling/sampling_logp_difference/mean": 0.015225168317556381, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 226.234375, + "completions/mean_terminated_length": 226.234375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.37977835536003113, + "epoch": 1.5575980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6355925387914, + "kl": 0.03878624737262726, + "learning_rate": 5.558052595053586e-07, + "loss": -0.0043, + "num_tokens": 40176532.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5467371940612793, + "sampling/importance_sampling_ratio/mean": 1.0005635023117065, + "sampling/importance_sampling_ratio/min": 0.6368696689605713, + "sampling/sampling_logp_difference/max": 0.4511902332305908, + "sampling/sampling_logp_difference/mean": 0.013250889256596565, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 197.375, + "completions/mean_terminated_length": 197.375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.33750832080841064, + "epoch": 1.5588235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01839720443880899, + "kl": 0.030976727604866028, + "learning_rate": 5.550972747440005e-07, + "loss": 0.0003, + "num_tokens": 40203404.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6207518577575684, + "sampling/importance_sampling_ratio/mean": 1.0003291368484497, + "sampling/importance_sampling_ratio/min": 0.6080752611160278, + "sampling/sampling_logp_difference/max": 0.49745655059814453, + "sampling/sampling_logp_difference/mean": 0.014426854439079762, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 265.71875, + "completions/mean_terminated_length": 265.71875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.46240752935409546, + "epoch": 1.5600490196078431, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9579825277156595, + "kl": 0.05170414596796036, + "learning_rate": 5.543891781386655e-07, + "loss": 0.0534, + "num_tokens": 40245946.0, + "reward": 0.40625, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.3931963443756104, + "sampling/importance_sampling_ratio/mean": 0.9996916055679321, + "sampling/importance_sampling_ratio/min": 0.6954973936080933, + "sampling/sampling_logp_difference/max": 0.3631279468536377, + "sampling/sampling_logp_difference/mean": 0.014100045897066593, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 191.0, + "completions/mean_terminated_length": 191.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.34568995237350464, + "epoch": 1.5612745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026876653440361203, + "kl": 0.03240996599197388, + "learning_rate": 5.536809711267443e-07, + "loss": 0.0003, + "num_tokens": 40274010.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6146008968353271, + "sampling/importance_sampling_ratio/mean": 1.0002329349517822, + "sampling/importance_sampling_ratio/min": 0.6587986946105957, + "sampling/sampling_logp_difference/max": 0.47908782958984375, + "sampling/sampling_logp_difference/mean": 0.013071177527308464, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 195.96875, + "completions/mean_terminated_length": 195.96875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.419927179813385, + "epoch": 1.5625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022425113760107574, + "kl": 0.03857335448265076, + "learning_rate": 5.529726551458526e-07, + "loss": 0.0004, + "num_tokens": 40307144.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4335927963256836, + "sampling/importance_sampling_ratio/mean": 0.9999467730522156, + "sampling/importance_sampling_ratio/min": 0.6542967557907104, + "sampling/sampling_logp_difference/max": 0.4241943359375, + "sampling/sampling_logp_difference/mean": 0.015529746189713478, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 213.015625, + "completions/mean_terminated_length": 213.015625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.4056742191314697, + "epoch": 1.5637254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0232241560707057, + "kl": 0.03813936561346054, + "learning_rate": 5.522642316338268e-07, + "loss": 0.0003, + "num_tokens": 40338713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6377936601638794, + "sampling/importance_sampling_ratio/mean": 1.000185489654541, + "sampling/importance_sampling_ratio/min": 0.6237207651138306, + "sampling/sampling_logp_difference/max": 0.4933500289916992, + "sampling/sampling_logp_difference/mean": 0.014487029053270817, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 163.84375, + "completions/mean_terminated_length": 163.84375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4125819802284241, + "epoch": 1.5649509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030463973536295753, + "kl": 0.061399657279253006, + "learning_rate": 5.515557020287218e-07, + "loss": 0.0006, + "num_tokens": 40364751.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6288862228393555, + "sampling/importance_sampling_ratio/mean": 1.0000745058059692, + "sampling/importance_sampling_ratio/min": 0.7075063586235046, + "sampling/sampling_logp_difference/max": 0.4878964424133301, + "sampling/sampling_logp_difference/mean": 0.015359732322394848, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 214.375, + "completions/mean_terminated_length": 214.375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.4101026654243469, + "epoch": 1.5661764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03697423516826225, + "kl": 0.06247444078326225, + "learning_rate": 5.508470677688078e-07, + "loss": 0.0006, + "num_tokens": 40397063.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3432215452194214, + "sampling/importance_sampling_ratio/mean": 0.9993553161621094, + "sampling/importance_sampling_ratio/min": 0.6600155234336853, + "sampling/sampling_logp_difference/max": 0.4154919385910034, + "sampling/sampling_logp_difference/mean": 0.014263832941651344, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 212.21875, + "completions/mean_terminated_length": 212.21875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.48836657404899597, + "epoch": 1.5674019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7521601665316348, + "kl": 0.05656624212861061, + "learning_rate": 5.501383302925677e-07, + "loss": -0.0254, + "num_tokens": 40433349.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6507452726364136, + "sampling/importance_sampling_ratio/mean": 0.9997700452804565, + "sampling/importance_sampling_ratio/min": 0.6106220483779907, + "sampling/sampling_logp_difference/max": 0.5012269020080566, + "sampling/sampling_logp_difference/mean": 0.01653190888464451, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 221.34375, + "completions/mean_terminated_length": 221.34375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.43000924587249756, + "epoch": 1.5686274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020274630867750474, + "kl": 0.040560007095336914, + "learning_rate": 5.494294910386933e-07, + "loss": 0.0004, + "num_tokens": 40470571.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7524255514144897, + "sampling/importance_sampling_ratio/mean": 1.000464677810669, + "sampling/importance_sampling_ratio/min": 0.5848381519317627, + "sampling/sampling_logp_difference/max": 0.5610008239746094, + "sampling/sampling_logp_difference/mean": 0.015560484491288662, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 155.296875, + "completions/mean_terminated_length": 155.296875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.42848485708236694, + "epoch": 1.5698529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029067613428986974, + "kl": 0.049033183604478836, + "learning_rate": 5.487205514460835e-07, + "loss": 0.0005, + "num_tokens": 40496974.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4896117448806763, + "sampling/importance_sampling_ratio/mean": 1.0005073547363281, + "sampling/importance_sampling_ratio/min": 0.618243396282196, + "sampling/sampling_logp_difference/max": 0.48087310791015625, + "sampling/sampling_logp_difference/mean": 0.015883062034845352, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 149.21875, + "completions/mean_terminated_length": 149.21875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.3263126611709595, + "epoch": 1.571078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034101301532964895, + "kl": 0.044961195439100266, + "learning_rate": 5.480115129538409e-07, + "loss": 0.0004, + "num_tokens": 40525420.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.405886173248291, + "sampling/importance_sampling_ratio/mean": 1.0003024339675903, + "sampling/importance_sampling_ratio/min": 0.6408926248550415, + "sampling/sampling_logp_difference/max": 0.44489336013793945, + "sampling/sampling_logp_difference/mean": 0.013876670971512794, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 189.859375, + "completions/mean_terminated_length": 189.859375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.41377168893814087, + "epoch": 1.5723039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02701900619995695, + "kl": 0.05792352557182312, + "learning_rate": 5.473023770012686e-07, + "loss": 0.0006, + "num_tokens": 40553427.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000136137008667, + "sampling/importance_sampling_ratio/min": 0.6201323866844177, + "sampling/sampling_logp_difference/max": 0.7010538578033447, + "sampling/sampling_logp_difference/mean": 0.016421273350715637, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 197.6875, + "completions/mean_terminated_length": 197.6875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4023253917694092, + "epoch": 1.5735294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023264841923209056, + "kl": 0.04238691180944443, + "learning_rate": 5.465931450278676e-07, + "loss": 0.0004, + "num_tokens": 40585567.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.303682565689087, + "sampling/importance_sampling_ratio/mean": 0.9998633861541748, + "sampling/importance_sampling_ratio/min": 0.6077167391777039, + "sampling/sampling_logp_difference/max": 0.4980463981628418, + "sampling/sampling_logp_difference/mean": 0.014768811874091625, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 262.921875, + "completions/mean_terminated_length": 262.921875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4775471091270447, + "epoch": 1.5747549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01923680643591346, + "kl": 0.04368182271718979, + "learning_rate": 5.458838184733341e-07, + "loss": 0.0004, + "num_tokens": 40618570.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4640520811080933, + "sampling/importance_sampling_ratio/mean": 1.0002003908157349, + "sampling/importance_sampling_ratio/min": 0.6963002681732178, + "sampling/sampling_logp_difference/max": 0.3812079429626465, + "sampling/sampling_logp_difference/mean": 0.015857627615332603, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 189.203125, + "completions/mean_terminated_length": 189.203125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3754490613937378, + "epoch": 1.5759803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7177864859107354, + "kl": 0.034951746463775635, + "learning_rate": 5.451743987775559e-07, + "loss": -0.0027, + "num_tokens": 40649975.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3653631210327148, + "sampling/importance_sampling_ratio/mean": 0.9998676776885986, + "sampling/importance_sampling_ratio/min": 0.6218180656433105, + "sampling/sampling_logp_difference/max": 0.47510766983032227, + "sampling/sampling_logp_difference/mean": 0.013944664038717747, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 225.65625, + "completions/mean_terminated_length": 225.65625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4051673412322998, + "epoch": 1.5772058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5990588312894888, + "kl": 0.043059948831796646, + "learning_rate": 5.444648873806101e-07, + "loss": -0.0108, + "num_tokens": 40679457.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6007344722747803, + "sampling/importance_sampling_ratio/mean": 0.9999345541000366, + "sampling/importance_sampling_ratio/min": 0.5071929097175598, + "sampling/sampling_logp_difference/max": 0.6788637638092041, + "sampling/sampling_logp_difference/mean": 0.015464607626199722, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 178.6875, + "completions/mean_terminated_length": 178.6875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.33906644582748413, + "epoch": 1.5784313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7745910230403468, + "kl": 0.029677651822566986, + "learning_rate": 5.437552857227597e-07, + "loss": -0.0025, + "num_tokens": 40707453.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.436586618423462, + "sampling/importance_sampling_ratio/mean": 0.9997626543045044, + "sampling/importance_sampling_ratio/min": 0.6366055011749268, + "sampling/sampling_logp_difference/max": 0.45160508155822754, + "sampling/sampling_logp_difference/mean": 0.01307828351855278, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 203.234375, + "completions/mean_terminated_length": 203.234375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4389653205871582, + "epoch": 1.579656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6227759495627972, + "kl": 0.04993196576833725, + "learning_rate": 5.430455952444512e-07, + "loss": -0.0091, + "num_tokens": 40732556.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4736192226409912, + "sampling/importance_sampling_ratio/mean": 0.9992948770523071, + "sampling/importance_sampling_ratio/min": 0.6626046299934387, + "sampling/sampling_logp_difference/max": 0.4115767478942871, + "sampling/sampling_logp_difference/mean": 0.01605891063809395, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 230.03125, + "completions/mean_terminated_length": 230.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4080777168273926, + "epoch": 1.5808823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018053235471991634, + "kl": 0.034843411296606064, + "learning_rate": 5.423358173863116e-07, + "loss": 0.0003, + "num_tokens": 40765054.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4352980852127075, + "sampling/importance_sampling_ratio/mean": 1.0002480745315552, + "sampling/importance_sampling_ratio/min": 0.654730498790741, + "sampling/sampling_logp_difference/max": 0.4235316514968872, + "sampling/sampling_logp_difference/mean": 0.01420527696609497, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 201.515625, + "completions/mean_terminated_length": 201.515625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.33541762828826904, + "epoch": 1.5821078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01863793061088374, + "kl": 0.035432569682598114, + "learning_rate": 5.416259535891446e-07, + "loss": 0.0003, + "num_tokens": 40795071.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.43512761592865, + "sampling/importance_sampling_ratio/mean": 0.999920129776001, + "sampling/importance_sampling_ratio/min": 0.6430781483650208, + "sampling/sampling_logp_difference/max": 0.44148898124694824, + "sampling/sampling_logp_difference/mean": 0.012585221789777279, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 204.359375, + "completions/mean_terminated_length": 204.359375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.43739044666290283, + "epoch": 1.5833333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02115004900477092, + "kl": 0.0341816246509552, + "learning_rate": 5.409160052939291e-07, + "loss": 0.0003, + "num_tokens": 40830054.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.751325011253357, + "sampling/importance_sampling_ratio/mean": 0.9997460842132568, + "sampling/importance_sampling_ratio/min": 0.7260932922363281, + "sampling/sampling_logp_difference/max": 0.5603725910186768, + "sampling/sampling_logp_difference/mean": 0.015032818540930748, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 153.359375, + "completions/mean_terminated_length": 153.359375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3930108845233917, + "epoch": 1.5845588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03688061085249617, + "kl": 0.05595723167061806, + "learning_rate": 5.402059739418148e-07, + "loss": 0.0005, + "num_tokens": 40856589.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.597269892692566, + "sampling/importance_sampling_ratio/mean": 1.0001264810562134, + "sampling/importance_sampling_ratio/min": 0.5229350328445435, + "sampling/sampling_logp_difference/max": 0.6482980251312256, + "sampling/sampling_logp_difference/mean": 0.015565967187285423, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 196.34375, + "completions/mean_terminated_length": 196.34375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.35479289293289185, + "epoch": 1.5857843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017859915243605095, + "kl": 0.034353796392679214, + "learning_rate": 5.394958609741206e-07, + "loss": 0.0003, + "num_tokens": 40884963.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4333220720291138, + "sampling/importance_sampling_ratio/mean": 0.9996464252471924, + "sampling/importance_sampling_ratio/min": 0.6380343437194824, + "sampling/sampling_logp_difference/max": 0.44936323165893555, + "sampling/sampling_logp_difference/mean": 0.013390054926276207, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 265.265625, + "completions/mean_terminated_length": 265.265625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.5733312368392944, + "epoch": 1.5870098039215685, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.0958078551567019, + "kl": 0.06653326004743576, + "learning_rate": 5.387856678323307e-07, + "loss": 0.0205, + "num_tokens": 40919972.0, + "reward": -0.09375, + "reward_std": 0.6802700161933899, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006455183029175, + "sampling/importance_sampling_ratio/min": 0.6330235004425049, + "sampling/sampling_logp_difference/max": 0.9251754283905029, + "sampling/sampling_logp_difference/mean": 0.017458755522966385, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 171.21875, + "completions/mean_terminated_length": 171.21875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.4025220572948456, + "epoch": 1.5882352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019983676868800686, + "kl": 0.04178793355822563, + "learning_rate": 5.380753959580922e-07, + "loss": 0.0003, + "num_tokens": 40960546.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4041763544082642, + "sampling/importance_sampling_ratio/mean": 0.9998598098754883, + "sampling/importance_sampling_ratio/min": 0.7072054147720337, + "sampling/sampling_logp_difference/max": 0.3464341163635254, + "sampling/sampling_logp_difference/mean": 0.014135192148387432, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 246.9375, + "completions/mean_terminated_length": 246.9375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.5462402105331421, + "epoch": 1.5894607843137254, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9179610903917748, + "kl": 0.057419903576374054, + "learning_rate": 5.373650467932121e-07, + "loss": 0.0205, + "num_tokens": 40994238.0, + "reward": 0.59375, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.358890414237976, + "sampling/importance_sampling_ratio/mean": 0.9993711709976196, + "sampling/importance_sampling_ratio/min": 0.7226272821426392, + "sampling/sampling_logp_difference/max": 0.3248617649078369, + "sampling/sampling_logp_difference/mean": 0.015900490805506706, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 231.1875, + "completions/mean_terminated_length": 231.1875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.35127443075180054, + "epoch": 1.590686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020664295052024075, + "kl": 0.028000975027680397, + "learning_rate": 5.366546217796541e-07, + "loss": 0.0003, + "num_tokens": 41029946.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3271061182022095, + "sampling/importance_sampling_ratio/mean": 1.0001511573791504, + "sampling/importance_sampling_ratio/min": 0.7108889818191528, + "sampling/sampling_logp_difference/max": 0.34123897552490234, + "sampling/sampling_logp_difference/mean": 0.012899599969387054, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 214.03125, + "completions/mean_terminated_length": 214.03125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.35238945484161377, + "epoch": 1.5919117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0824990246826658, + "kl": 0.03601183369755745, + "learning_rate": 5.359441223595363e-07, + "loss": 0.0083, + "num_tokens": 41063820.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.799363613128662, + "sampling/importance_sampling_ratio/mean": 1.000187635421753, + "sampling/importance_sampling_ratio/min": 0.7322774529457092, + "sampling/sampling_logp_difference/max": 0.5874330997467041, + "sampling/sampling_logp_difference/mean": 0.013019061647355556, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 199.21875, + "completions/mean_terminated_length": 199.21875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3887077867984772, + "epoch": 1.593137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0711855415051406, + "kl": 0.044857725501060486, + "learning_rate": 5.352335499751269e-07, + "loss": -0.0105, + "num_tokens": 41092842.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4760971069335938, + "sampling/importance_sampling_ratio/mean": 1.00020170211792, + "sampling/importance_sampling_ratio/min": 0.6391717791557312, + "sampling/sampling_logp_difference/max": 0.4475820064544678, + "sampling/sampling_logp_difference/mean": 0.013103963807225227, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 249.46875, + "completions/mean_terminated_length": 249.46875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.38398146629333496, + "epoch": 1.594362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7715913046425408, + "kl": 0.0276858601719141, + "learning_rate": 5.345229060688433e-07, + "loss": 0.0218, + "num_tokens": 41128344.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6073960065841675, + "sampling/importance_sampling_ratio/mean": 1.000038981437683, + "sampling/importance_sampling_ratio/min": 0.6056225895881653, + "sampling/sampling_logp_difference/max": 0.5014982223510742, + "sampling/sampling_logp_difference/mean": 0.012683748267591, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 214.515625, + "completions/mean_terminated_length": 214.515625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.38036349415779114, + "epoch": 1.5955882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01941041468749474, + "kl": 0.03759653866291046, + "learning_rate": 5.338121920832475e-07, + "loss": 0.0003, + "num_tokens": 41159721.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4104421138763428, + "sampling/importance_sampling_ratio/mean": 1.0005254745483398, + "sampling/importance_sampling_ratio/min": 0.6971437931060791, + "sampling/sampling_logp_difference/max": 0.3607635498046875, + "sampling/sampling_logp_difference/mean": 0.014444278553128242, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 216.890625, + "completions/mean_terminated_length": 216.890625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4683217406272888, + "epoch": 1.596813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7166611596871837, + "kl": 0.06479225307703018, + "learning_rate": 5.331014094610438e-07, + "loss": -0.0064, + "num_tokens": 41188850.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.454557180404663, + "sampling/importance_sampling_ratio/mean": 0.999299168586731, + "sampling/importance_sampling_ratio/min": 0.6173462271690369, + "sampling/sampling_logp_difference/max": 0.48232531547546387, + "sampling/sampling_logp_difference/mean": 0.015371318906545639, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 194.828125, + "completions/mean_terminated_length": 194.828125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.36123284697532654, + "epoch": 1.5980392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022109711881336674, + "kl": 0.04242018610239029, + "learning_rate": 5.323905596450759e-07, + "loss": 0.0004, + "num_tokens": 41219111.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4179425239562988, + "sampling/importance_sampling_ratio/mean": 0.9995553493499756, + "sampling/importance_sampling_ratio/min": 0.5423824191093445, + "sampling/sampling_logp_difference/max": 0.6117839813232422, + "sampling/sampling_logp_difference/mean": 0.01411019079387188, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 203.65625, + "completions/mean_terminated_length": 203.65625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.408869206905365, + "epoch": 1.5992647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018898870023638714, + "kl": 0.03537284582853317, + "learning_rate": 5.31679644078324e-07, + "loss": 0.0004, + "num_tokens": 41247793.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5288374423980713, + "sampling/importance_sampling_ratio/mean": 0.9998724460601807, + "sampling/importance_sampling_ratio/min": 0.6658873558044434, + "sampling/sampling_logp_difference/max": 0.42450761795043945, + "sampling/sampling_logp_difference/mean": 0.013627918437123299, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 205.40625, + "completions/mean_terminated_length": 205.40625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.39194366335868835, + "epoch": 1.6004901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7494485877542367, + "kl": 0.044651098549366, + "learning_rate": 5.309686642039015e-07, + "loss": -0.0064, + "num_tokens": 41276507.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4754210710525513, + "sampling/importance_sampling_ratio/mean": 1.0001399517059326, + "sampling/importance_sampling_ratio/min": 0.7034305334091187, + "sampling/sampling_logp_difference/max": 0.3889434337615967, + "sampling/sampling_logp_difference/mean": 0.013582364656031132, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 182.484375, + "completions/mean_terminated_length": 182.484375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.35944098234176636, + "epoch": 1.6017156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018836484571554558, + "kl": 0.037919431924819946, + "learning_rate": 5.302576214650527e-07, + "loss": 0.0003, + "num_tokens": 41307194.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4416300058364868, + "sampling/importance_sampling_ratio/mean": 0.9997807741165161, + "sampling/importance_sampling_ratio/min": 0.6219866871833801, + "sampling/sampling_logp_difference/max": 0.4748365879058838, + "sampling/sampling_logp_difference/mean": 0.013919560238718987, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 191.484375, + "completions/mean_terminated_length": 191.484375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3720022439956665, + "epoch": 1.6029411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7226069802139616, + "kl": 0.04658423736691475, + "learning_rate": 5.295465173051491e-07, + "loss": 0.0121, + "num_tokens": 41337849.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.3114043474197388, + "sampling/importance_sampling_ratio/mean": 0.9997433423995972, + "sampling/importance_sampling_ratio/min": 0.6264688968658447, + "sampling/sampling_logp_difference/max": 0.46765613555908203, + "sampling/sampling_logp_difference/mean": 0.013154737651348114, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.45763248205184937, + "epoch": 1.6041666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7688062162783167, + "kl": 0.05524931475520134, + "learning_rate": 5.288353531676873e-07, + "loss": 0.012, + "num_tokens": 41368129.0, + "reward": -0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.414183497428894, + "sampling/importance_sampling_ratio/mean": 0.9998292922973633, + "sampling/importance_sampling_ratio/min": 0.7510547041893005, + "sampling/sampling_logp_difference/max": 0.34655237197875977, + "sampling/sampling_logp_difference/mean": 0.014586166478693485, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 156.71875, + "completions/mean_terminated_length": 156.71875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.28859561681747437, + "epoch": 1.6053921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02383512103423003, + "kl": 0.04662370681762695, + "learning_rate": 5.281241304962852e-07, + "loss": 0.0004, + "num_tokens": 41394447.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.565940022468567, + "sampling/importance_sampling_ratio/mean": 1.000131607055664, + "sampling/importance_sampling_ratio/min": 0.6829407811164856, + "sampling/sampling_logp_difference/max": 0.448486328125, + "sampling/sampling_logp_difference/mean": 0.01221737265586853, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 147.078125, + "completions/mean_terminated_length": 147.078125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.4415820837020874, + "epoch": 1.6066176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9719135272575133, + "kl": 0.06833760440349579, + "learning_rate": 5.2741285073468e-07, + "loss": -0.0145, + "num_tokens": 41430052.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6259056329727173, + "sampling/importance_sampling_ratio/mean": 0.9997236728668213, + "sampling/importance_sampling_ratio/min": 0.6579318046569824, + "sampling/sampling_logp_difference/max": 0.4860649108886719, + "sampling/sampling_logp_difference/mean": 0.016034625470638275, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 196.53125, + "completions/mean_terminated_length": 196.53125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4524112641811371, + "epoch": 1.607843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4441698959929314, + "kl": 0.05132122337818146, + "learning_rate": 5.267015153267245e-07, + "loss": -0.0446, + "num_tokens": 41462454.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6315703392028809, + "sampling/importance_sampling_ratio/mean": 0.9993882179260254, + "sampling/importance_sampling_ratio/min": 0.6724433898925781, + "sampling/sampling_logp_difference/max": 0.48954296112060547, + "sampling/sampling_logp_difference/mean": 0.015758004039525986, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 160.84375, + "completions/mean_terminated_length": 160.84375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3407145142555237, + "epoch": 1.6090686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9106068174314087, + "kl": 0.041749805212020874, + "learning_rate": 5.259901257163844e-07, + "loss": -0.0003, + "num_tokens": 41489292.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6275004148483276, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 0.7483863234519958, + "sampling/sampling_logp_difference/max": 0.4870452880859375, + "sampling/sampling_logp_difference/mean": 0.012471785768866539, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 139.921875, + "completions/mean_terminated_length": 139.921875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.4330427348613739, + "epoch": 1.6102941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0283704013003629, + "kl": 0.048846468329429626, + "learning_rate": 5.252786833477358e-07, + "loss": 0.0005, + "num_tokens": 41517879.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3068888187408447, + "sampling/importance_sampling_ratio/mean": 1.0002931356430054, + "sampling/importance_sampling_ratio/min": 0.6772646307945251, + "sampling/sampling_logp_difference/max": 0.3896932601928711, + "sampling/sampling_logp_difference/mean": 0.014600535854697227, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 245.0625, + "completions/mean_terminated_length": 245.0625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.39107346534729004, + "epoch": 1.6115196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6438024775641634, + "kl": 0.028462667018175125, + "learning_rate": 5.245671896649612e-07, + "loss": -0.0296, + "num_tokens": 41552587.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.449144721031189, + "sampling/importance_sampling_ratio/mean": 0.9995109438896179, + "sampling/importance_sampling_ratio/min": 0.4164716899394989, + "sampling/sampling_logp_difference/max": 0.87593674659729, + "sampling/sampling_logp_difference/mean": 0.012780715711414814, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 193.015625, + "completions/mean_terminated_length": 193.015625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.419755756855011, + "epoch": 1.6127450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01988272897693721, + "kl": 0.03518611937761307, + "learning_rate": 5.23855646112348e-07, + "loss": 0.0003, + "num_tokens": 41580236.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4417973756790161, + "sampling/importance_sampling_ratio/mean": 0.999925434589386, + "sampling/importance_sampling_ratio/min": 0.7690115571022034, + "sampling/sampling_logp_difference/max": 0.3658905029296875, + "sampling/sampling_logp_difference/mean": 0.014504168182611465, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 201.046875, + "completions/mean_terminated_length": 201.046875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.45324522256851196, + "epoch": 1.6139705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027969975864834386, + "kl": 0.044220201671123505, + "learning_rate": 5.231440541342845e-07, + "loss": 0.0004, + "num_tokens": 41608607.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3302301168441772, + "sampling/importance_sampling_ratio/mean": 0.9996073246002197, + "sampling/importance_sampling_ratio/min": 0.6945046782493591, + "sampling/sampling_logp_difference/max": 0.36455631256103516, + "sampling/sampling_logp_difference/mean": 0.015408453531563282, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 222.09375, + "completions/mean_terminated_length": 222.09375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3381379246711731, + "epoch": 1.6151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016143746427933273, + "kl": 0.03346278890967369, + "learning_rate": 5.224324151752575e-07, + "loss": 0.0003, + "num_tokens": 41643205.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4749078750610352, + "sampling/importance_sampling_ratio/mean": 0.9998982548713684, + "sampling/importance_sampling_ratio/min": 0.7725921869277954, + "sampling/sampling_logp_difference/max": 0.3885955810546875, + "sampling/sampling_logp_difference/mean": 0.011889193207025528, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 216.828125, + "completions/mean_terminated_length": 216.828125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.46138995885849, + "epoch": 1.616421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3058688689762155, + "kl": 0.0376376174390316, + "learning_rate": 5.217207306798487e-07, + "loss": -0.0307, + "num_tokens": 41674202.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.4060932397842407, + "sampling/importance_sampling_ratio/mean": 0.9996814131736755, + "sampling/importance_sampling_ratio/min": 0.5261827707290649, + "sampling/sampling_logp_difference/max": 0.6421066522598267, + "sampling/sampling_logp_difference/mean": 0.01577083021402359, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 179.171875, + "completions/mean_terminated_length": 179.171875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.374620258808136, + "epoch": 1.6176470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023526682651923623, + "kl": 0.03740628436207771, + "learning_rate": 5.210090020927326e-07, + "loss": 0.0004, + "num_tokens": 41703829.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4194133281707764, + "sampling/importance_sampling_ratio/mean": 0.9996632933616638, + "sampling/importance_sampling_ratio/min": 0.6773514151573181, + "sampling/sampling_logp_difference/max": 0.389565110206604, + "sampling/sampling_logp_difference/mean": 0.014365723356604576, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 187.84375, + "completions/mean_terminated_length": 187.84375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4308626055717468, + "epoch": 1.6188725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021660701054459646, + "kl": 0.039525438100099564, + "learning_rate": 5.202972308586735e-07, + "loss": 0.0004, + "num_tokens": 41738619.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.607591986656189, + "sampling/importance_sampling_ratio/mean": 1.0002162456512451, + "sampling/importance_sampling_ratio/min": 0.7422322630882263, + "sampling/sampling_logp_difference/max": 0.47473740577697754, + "sampling/sampling_logp_difference/mean": 0.015454979613423347, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 173.640625, + "completions/mean_terminated_length": 173.640625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.41735368967056274, + "epoch": 1.6200980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022611943103419407, + "kl": 0.035894010215997696, + "learning_rate": 5.195854184225213e-07, + "loss": 0.0004, + "num_tokens": 41768004.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2847273349761963, + "sampling/importance_sampling_ratio/mean": 0.999764084815979, + "sampling/importance_sampling_ratio/min": 0.7095668315887451, + "sampling/sampling_logp_difference/max": 0.34310054779052734, + "sampling/sampling_logp_difference/mean": 0.014909334480762482, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 186.296875, + "completions/mean_terminated_length": 186.296875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.36383774876594543, + "epoch": 1.6213235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019477232921706708, + "kl": 0.03461875766515732, + "learning_rate": 5.188735662292107e-07, + "loss": 0.0003, + "num_tokens": 41795767.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.663252830505371, + "sampling/importance_sampling_ratio/mean": 1.0004137754440308, + "sampling/importance_sampling_ratio/min": 0.6158820986747742, + "sampling/sampling_logp_difference/max": 0.5087752342224121, + "sampling/sampling_logp_difference/mean": 0.014002447947859764, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 164.6875, + "completions/mean_terminated_length": 164.6875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.41262656450271606, + "epoch": 1.6225490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8563376392888821, + "kl": 0.05176469683647156, + "learning_rate": 5.181616757237561e-07, + "loss": 0.0065, + "num_tokens": 41821475.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4510161876678467, + "sampling/importance_sampling_ratio/mean": 1.0002225637435913, + "sampling/importance_sampling_ratio/min": 0.6574416756629944, + "sampling/sampling_logp_difference/max": 0.4193992614746094, + "sampling/sampling_logp_difference/mean": 0.015879232436418533, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 192.453125, + "completions/mean_terminated_length": 192.453125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.4138108193874359, + "epoch": 1.6237745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030717015522757213, + "kl": 0.06454174965620041, + "learning_rate": 5.174497483512505e-07, + "loss": 0.0007, + "num_tokens": 41851264.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3186839818954468, + "sampling/importance_sampling_ratio/mean": 1.0005762577056885, + "sampling/importance_sampling_ratio/min": 0.6449248790740967, + "sampling/sampling_logp_difference/max": 0.4386214017868042, + "sampling/sampling_logp_difference/mean": 0.015171162784099579, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 204.359375, + "completions/mean_terminated_length": 204.359375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3680081367492676, + "epoch": 1.625, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0144647496447934, + "kl": 0.04304346442222595, + "learning_rate": 5.167377855568612e-07, + "loss": 0.073, + "num_tokens": 41885303.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5975630283355713, + "sampling/importance_sampling_ratio/mean": 1.0000262260437012, + "sampling/importance_sampling_ratio/min": 0.6099649667739868, + "sampling/sampling_logp_difference/max": 0.4943537712097168, + "sampling/sampling_logp_difference/mean": 0.01340695470571518, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 181.953125, + "completions/mean_terminated_length": 181.953125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.4094926118850708, + "epoch": 1.6262254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0132436576303403, + "kl": 0.038716308772563934, + "learning_rate": 5.160257887858277e-07, + "loss": 0.0322, + "num_tokens": 41922756.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5303932428359985, + "sampling/importance_sampling_ratio/mean": 1.0002155303955078, + "sampling/importance_sampling_ratio/min": 0.7565470933914185, + "sampling/sampling_logp_difference/max": 0.4255247116088867, + "sampling/sampling_logp_difference/mean": 0.015106882899999619, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 165.234375, + "completions/mean_terminated_length": 165.234375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.37199485301971436, + "epoch": 1.6274509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02053875434826521, + "kl": 0.04270268231630325, + "learning_rate": 5.15313759483458e-07, + "loss": 0.0004, + "num_tokens": 41948563.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3621101379394531, + "sampling/importance_sampling_ratio/mean": 0.9995244145393372, + "sampling/importance_sampling_ratio/min": 0.6622359752655029, + "sampling/sampling_logp_difference/max": 0.4121333360671997, + "sampling/sampling_logp_difference/mean": 0.014357365667819977, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 168.140625, + "completions/mean_terminated_length": 168.140625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.5480635762214661, + "epoch": 1.6286764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8991249863607076, + "kl": 0.06932510435581207, + "learning_rate": 5.146016990951268e-07, + "loss": -0.0296, + "num_tokens": 41978252.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3300795555114746, + "sampling/importance_sampling_ratio/mean": 0.9992871880531311, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.01712382212281227, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 181.640625, + "completions/mean_terminated_length": 181.640625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3745878338813782, + "epoch": 1.6299019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8996782301834043, + "kl": 0.04495788365602493, + "learning_rate": 5.138896090662714e-07, + "loss": 0.0197, + "num_tokens": 42010021.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3376659154891968, + "sampling/importance_sampling_ratio/mean": 1.0002796649932861, + "sampling/importance_sampling_ratio/min": 0.6891587972640991, + "sampling/sampling_logp_difference/max": 0.37228357791900635, + "sampling/sampling_logp_difference/mean": 0.01386922039091587, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 163.671875, + "completions/mean_terminated_length": 163.671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.4854891896247864, + "epoch": 1.6311274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8256542939191935, + "kl": 0.060473497956991196, + "learning_rate": 5.131774908423898e-07, + "loss": 0.0016, + "num_tokens": 42035952.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6098212003707886, + "sampling/importance_sampling_ratio/mean": 1.0002360343933105, + "sampling/importance_sampling_ratio/min": 0.6633803248405457, + "sampling/sampling_logp_difference/max": 0.4761230945587158, + "sampling/sampling_logp_difference/mean": 0.01714945212006569, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 164.828125, + "completions/mean_terminated_length": 164.828125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3972048759460449, + "epoch": 1.6323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02213680524369021, + "kl": 0.04135869815945625, + "learning_rate": 5.124653458690365e-07, + "loss": 0.0004, + "num_tokens": 42064821.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6158479452133179, + "sampling/importance_sampling_ratio/mean": 1.0002546310424805, + "sampling/importance_sampling_ratio/min": 0.7061274647712708, + "sampling/sampling_logp_difference/max": 0.4798598289489746, + "sampling/sampling_logp_difference/mean": 0.0156618170440197, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 179.4375, + "completions/mean_terminated_length": 179.4375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.37442547082901, + "epoch": 1.633578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02273687112248634, + "kl": 0.03795505315065384, + "learning_rate": 5.117531755918207e-07, + "loss": 0.0004, + "num_tokens": 42092401.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4187004566192627, + "sampling/importance_sampling_ratio/mean": 1.000243902206421, + "sampling/importance_sampling_ratio/min": 0.700717031955719, + "sampling/sampling_logp_difference/max": 0.3556511402130127, + "sampling/sampling_logp_difference/mean": 0.014409808441996574, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 205.375, + "completions/mean_terminated_length": 205.375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.40468767285346985, + "epoch": 1.6348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022758320385781947, + "kl": 0.03673470765352249, + "learning_rate": 5.110409814564031e-07, + "loss": 0.0003, + "num_tokens": 42128281.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.439674973487854, + "sampling/importance_sampling_ratio/mean": 0.9996200799942017, + "sampling/importance_sampling_ratio/min": 0.6193218231201172, + "sampling/sampling_logp_difference/max": 0.47913026809692383, + "sampling/sampling_logp_difference/mean": 0.014988134615123272, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 175.53125, + "completions/mean_terminated_length": 175.53125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4532517194747925, + "epoch": 1.6360294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8069309298947697, + "kl": 0.042012833058834076, + "learning_rate": 5.103287649084926e-07, + "loss": 0.0042, + "num_tokens": 42156587.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.395279884338379, + "sampling/importance_sampling_ratio/mean": 0.9997216463088989, + "sampling/importance_sampling_ratio/min": 0.6385092735290527, + "sampling/sampling_logp_difference/max": 0.44861912727355957, + "sampling/sampling_logp_difference/mean": 0.015587395057082176, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 153.59375, + "completions/mean_terminated_length": 153.59375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.33972054719924927, + "epoch": 1.6372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021105598295994576, + "kl": 0.03662524372339249, + "learning_rate": 5.096165273938435e-07, + "loss": 0.0003, + "num_tokens": 42183297.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3670557737350464, + "sampling/importance_sampling_ratio/mean": 1.0000076293945312, + "sampling/importance_sampling_ratio/min": 0.6033264994621277, + "sampling/sampling_logp_difference/max": 0.5052967071533203, + "sampling/sampling_logp_difference/mean": 0.013634255155920982, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 183.359375, + "completions/mean_terminated_length": 183.359375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.4015965759754181, + "epoch": 1.6384803921568627, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8156907350196695, + "kl": 0.05624488741159439, + "learning_rate": 5.089042703582533e-07, + "loss": -0.0005, + "num_tokens": 42212376.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4057984352111816, + "sampling/importance_sampling_ratio/mean": 1.00014066696167, + "sampling/importance_sampling_ratio/min": 0.6264901161193848, + "sampling/sampling_logp_difference/max": 0.4676222801208496, + "sampling/sampling_logp_difference/mean": 0.014927358366549015, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 198.09375, + "completions/mean_terminated_length": 198.09375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.45356786251068115, + "epoch": 1.6397058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.813557953406789, + "kl": 0.03787259757518768, + "learning_rate": 5.081919952475583e-07, + "loss": -0.0005, + "num_tokens": 42248446.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5267914533615112, + "sampling/importance_sampling_ratio/mean": 1.0002291202545166, + "sampling/importance_sampling_ratio/min": 0.41856861114501953, + "sampling/sampling_logp_difference/max": 0.8709144592285156, + "sampling/sampling_logp_difference/mean": 0.015977103263139725, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 184.875, + "completions/mean_terminated_length": 184.875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.40405213832855225, + "epoch": 1.6409313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022911835051461407, + "kl": 0.040140967816114426, + "learning_rate": 5.074797035076318e-07, + "loss": 0.0004, + "num_tokens": 42274950.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4439164400100708, + "sampling/importance_sampling_ratio/mean": 0.9999382495880127, + "sampling/importance_sampling_ratio/min": 0.6416292786598206, + "sampling/sampling_logp_difference/max": 0.4437446594238281, + "sampling/sampling_logp_difference/mean": 0.01622859016060829, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 191.609375, + "completions/mean_terminated_length": 191.609375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.367656946182251, + "epoch": 1.642156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9209673126351376, + "kl": 0.03389076888561249, + "learning_rate": 5.067673965843812e-07, + "loss": -0.0071, + "num_tokens": 42303917.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.620739459991455, + "sampling/importance_sampling_ratio/mean": 1.0002752542495728, + "sampling/importance_sampling_ratio/min": 0.679011881351471, + "sampling/sampling_logp_difference/max": 0.4828824996948242, + "sampling/sampling_logp_difference/mean": 0.014965730719268322, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 212.75, + "completions/mean_terminated_length": 212.75, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3058163821697235, + "epoch": 1.6433823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027913752649485868, + "kl": 0.03194785118103027, + "learning_rate": 5.060550759237441e-07, + "loss": 0.0003, + "num_tokens": 42334013.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6105678081512451, + "sampling/importance_sampling_ratio/mean": 1.0004963874816895, + "sampling/importance_sampling_ratio/min": 0.6547014117240906, + "sampling/sampling_logp_difference/max": 0.47658681869506836, + "sampling/sampling_logp_difference/mean": 0.012903638184070587, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 176.984375, + "completions/mean_terminated_length": 176.984375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.36098748445510864, + "epoch": 1.6446078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018980144265438367, + "kl": 0.026891544461250305, + "learning_rate": 5.053427429716866e-07, + "loss": 0.0003, + "num_tokens": 42364940.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2840707302093506, + "sampling/importance_sampling_ratio/mean": 0.9999522566795349, + "sampling/importance_sampling_ratio/min": 0.49538055062294006, + "sampling/sampling_logp_difference/max": 0.7024290561676025, + "sampling/sampling_logp_difference/mean": 0.014355067163705826, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 188.796875, + "completions/mean_terminated_length": 188.796875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.36714956164360046, + "epoch": 1.6458333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0230623433289197, + "kl": 0.03373267501592636, + "learning_rate": 5.046303991741993e-07, + "loss": 0.0171, + "num_tokens": 42394495.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000599980354309, + "sampling/importance_sampling_ratio/min": 0.6436612606048584, + "sampling/sampling_logp_difference/max": 0.7042334079742432, + "sampling/sampling_logp_difference/mean": 0.014708654955029488, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 210.96875, + "completions/mean_terminated_length": 210.96875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.49650660157203674, + "epoch": 1.6470588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.832054013171931, + "kl": 0.05175505578517914, + "learning_rate": 5.039180459772949e-07, + "loss": -0.0276, + "num_tokens": 42425821.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6660652160644531, + "sampling/importance_sampling_ratio/mean": 0.9998347759246826, + "sampling/importance_sampling_ratio/min": 0.7393490076065063, + "sampling/sampling_logp_difference/max": 0.5104646682739258, + "sampling/sampling_logp_difference/mean": 0.018163859844207764, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 126.96875, + "completions/mean_terminated_length": 126.96875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.35790514945983887, + "epoch": 1.6482843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050137352484596526, + "kl": 0.053024642169475555, + "learning_rate": 5.032056848270056e-07, + "loss": 0.0005, + "num_tokens": 42448571.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4242204427719116, + "sampling/importance_sampling_ratio/mean": 1.0000522136688232, + "sampling/importance_sampling_ratio/min": 0.6195096373558044, + "sampling/sampling_logp_difference/max": 0.47882699966430664, + "sampling/sampling_logp_difference/mean": 0.01624734327197075, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 163.859375, + "completions/mean_terminated_length": 163.859375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.310820996761322, + "epoch": 1.6495098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026732463611402052, + "kl": 0.0334944874048233, + "learning_rate": 5.02493317169379e-07, + "loss": 0.0003, + "num_tokens": 42473746.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.318579077720642, + "sampling/importance_sampling_ratio/mean": 0.9997299313545227, + "sampling/importance_sampling_ratio/min": 0.5049257874488831, + "sampling/sampling_logp_difference/max": 0.6833438873291016, + "sampling/sampling_logp_difference/mean": 0.013605006039142609, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 173.984375, + "completions/mean_terminated_length": 173.984375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.35105830430984497, + "epoch": 1.6507352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025926797408567508, + "kl": 0.03746583312749863, + "learning_rate": 5.017809444504767e-07, + "loss": 0.0004, + "num_tokens": 42504097.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.507704734802246, + "sampling/importance_sampling_ratio/mean": 0.9996305704116821, + "sampling/importance_sampling_ratio/min": 0.5362950563430786, + "sampling/sampling_logp_difference/max": 0.6230708360671997, + "sampling/sampling_logp_difference/mean": 0.014887738972902298, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 210.390625, + "completions/mean_terminated_length": 210.390625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3304094076156616, + "epoch": 1.6519607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020346980029235143, + "kl": 0.026339039206504822, + "learning_rate": 5.010685681163698e-07, + "loss": 0.0003, + "num_tokens": 42537594.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5016820430755615, + "sampling/importance_sampling_ratio/mean": 1.0007579326629639, + "sampling/importance_sampling_ratio/min": 0.6771543622016907, + "sampling/sampling_logp_difference/max": 0.40658581256866455, + "sampling/sampling_logp_difference/mean": 0.012531211599707603, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 149.703125, + "completions/mean_terminated_length": 149.703125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.394240140914917, + "epoch": 1.653186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03682560528055482, + "kl": 0.03671746701002121, + "learning_rate": 5.003561896131374e-07, + "loss": 0.0004, + "num_tokens": 42568231.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.401877522468567, + "sampling/importance_sampling_ratio/mean": 1.0008456707000732, + "sampling/importance_sampling_ratio/min": 0.6810407638549805, + "sampling/sampling_logp_difference/max": 0.38413310050964355, + "sampling/sampling_logp_difference/mean": 0.015878338366746902, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 177.546875, + "completions/mean_terminated_length": 177.546875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.35006511211395264, + "epoch": 1.6544117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02562559019198169, + "kl": 0.027174564078450203, + "learning_rate": 4.996438103868625e-07, + "loss": 0.0003, + "num_tokens": 42598442.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3723374605178833, + "sampling/importance_sampling_ratio/mean": 1.0002684593200684, + "sampling/importance_sampling_ratio/min": 0.7244256138801575, + "sampling/sampling_logp_difference/max": 0.3223762512207031, + "sampling/sampling_logp_difference/mean": 0.013350119814276695, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 197.5625, + "completions/mean_terminated_length": 197.5625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3624252378940582, + "epoch": 1.655637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6550030764097664, + "kl": 0.038706012070178986, + "learning_rate": 4.989314318836302e-07, + "loss": 0.0107, + "num_tokens": 42627646.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5278007984161377, + "sampling/importance_sampling_ratio/mean": 0.9998214840888977, + "sampling/importance_sampling_ratio/min": 0.6209051012992859, + "sampling/sampling_logp_difference/max": 0.4765770435333252, + "sampling/sampling_logp_difference/mean": 0.01568310149013996, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 145.078125, + "completions/mean_terminated_length": 145.078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.35354289412498474, + "epoch": 1.656862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0223500839934487, + "kl": 0.04581998288631439, + "learning_rate": 4.982190555495235e-07, + "loss": 0.0, + "num_tokens": 42650467.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.54673433303833, + "sampling/importance_sampling_ratio/mean": 1.0001983642578125, + "sampling/importance_sampling_ratio/min": 0.632348358631134, + "sampling/sampling_logp_difference/max": 0.4583148956298828, + "sampling/sampling_logp_difference/mean": 0.01473325490951538, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 162.265625, + "completions/mean_terminated_length": 162.265625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.27945151925086975, + "epoch": 1.6580882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031127494861176195, + "kl": 0.027711743488907814, + "learning_rate": 4.975066828306209e-07, + "loss": 0.0003, + "num_tokens": 42676436.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6095050573349, + "sampling/importance_sampling_ratio/mean": 1.0007215738296509, + "sampling/importance_sampling_ratio/min": 0.632074773311615, + "sampling/sampling_logp_difference/max": 0.47592663764953613, + "sampling/sampling_logp_difference/mean": 0.012972263619303703, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 161.734375, + "completions/mean_terminated_length": 161.734375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.29871469736099243, + "epoch": 1.659313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10170850044489836, + "kl": 0.04948166757822037, + "learning_rate": 4.967943151729944e-07, + "loss": 0.0005, + "num_tokens": 42701203.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4357198476791382, + "sampling/importance_sampling_ratio/mean": 1.000241756439209, + "sampling/importance_sampling_ratio/min": 0.6487295627593994, + "sampling/sampling_logp_difference/max": 0.43273937702178955, + "sampling/sampling_logp_difference/mean": 0.014093228615820408, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 148.046875, + "completions/mean_terminated_length": 148.046875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.2653568983078003, + "epoch": 1.6605392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8555905687961668, + "kl": 0.028595855459570885, + "learning_rate": 4.96081954022705e-07, + "loss": -0.0465, + "num_tokens": 42725318.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.508622169494629, + "sampling/importance_sampling_ratio/mean": 0.9999791979789734, + "sampling/importance_sampling_ratio/min": 0.6793181896209717, + "sampling/sampling_logp_difference/max": 0.4111967086791992, + "sampling/sampling_logp_difference/mean": 0.012172574177384377, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 162.734375, + "completions/mean_terminated_length": 162.734375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3512003421783447, + "epoch": 1.6617647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9904731342825035, + "kl": 0.05167574808001518, + "learning_rate": 4.953696008258008e-07, + "loss": 0.0539, + "num_tokens": 42750789.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4245202541351318, + "sampling/importance_sampling_ratio/mean": 1.0001580715179443, + "sampling/importance_sampling_ratio/min": 0.7033551335334778, + "sampling/sampling_logp_difference/max": 0.3538351058959961, + "sampling/sampling_logp_difference/mean": 0.01564112678170204, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 141.546875, + "completions/mean_terminated_length": 141.546875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.36807769536972046, + "epoch": 1.6629901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06052259875942137, + "kl": 0.05655297264456749, + "learning_rate": 4.946572570283134e-07, + "loss": 0.0005, + "num_tokens": 42776232.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5521773099899292, + "sampling/importance_sampling_ratio/mean": 0.9987242221832275, + "sampling/importance_sampling_ratio/min": 0.6245699524879456, + "sampling/sampling_logp_difference/max": 0.4706919193267822, + "sampling/sampling_logp_difference/mean": 0.017455413937568665, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 161.265625, + "completions/mean_terminated_length": 161.265625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3822873532772064, + "epoch": 1.6642156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8381209088210724, + "kl": 0.05772514268755913, + "learning_rate": 4.939449240762558e-07, + "loss": 0.0241, + "num_tokens": 42803305.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3750104904174805, + "sampling/importance_sampling_ratio/mean": 1.000871181488037, + "sampling/importance_sampling_ratio/min": 0.6172491908073425, + "sampling/sampling_logp_difference/max": 0.4824824333190918, + "sampling/sampling_logp_difference/mean": 0.01507329661399126, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 152.421875, + "completions/mean_terminated_length": 152.421875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3023286461830139, + "epoch": 1.6654411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035516933832001996, + "kl": 0.03554411977529526, + "learning_rate": 4.932326034156189e-07, + "loss": 0.0003, + "num_tokens": 42832164.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4886304140090942, + "sampling/importance_sampling_ratio/mean": 1.000510334968567, + "sampling/importance_sampling_ratio/min": 0.37227627635002136, + "sampling/sampling_logp_difference/max": 0.9881191253662109, + "sampling/sampling_logp_difference/mean": 0.013887686654925346, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 156.859375, + "completions/mean_terminated_length": 156.859375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.25837552547454834, + "epoch": 1.6666666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03076134315505385, + "kl": 0.03431840240955353, + "learning_rate": 4.925202964923683e-07, + "loss": 0.0003, + "num_tokens": 42856763.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7259713411331177, + "sampling/importance_sampling_ratio/mean": 1.001096248626709, + "sampling/importance_sampling_ratio/min": 0.6386511325836182, + "sampling/sampling_logp_difference/max": 0.5457899570465088, + "sampling/sampling_logp_difference/mean": 0.012383817695081234, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 171.484375, + "completions/mean_terminated_length": 171.484375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3070116639137268, + "epoch": 1.6678921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0349437000416922, + "kl": 0.03264414519071579, + "learning_rate": 4.918080047524417e-07, + "loss": 0.0003, + "num_tokens": 42882778.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5539450645446777, + "sampling/importance_sampling_ratio/mean": 1.0001025199890137, + "sampling/importance_sampling_ratio/min": 0.7177270650863647, + "sampling/sampling_logp_difference/max": 0.4407968521118164, + "sampling/sampling_logp_difference/mean": 0.01374280359596014, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 126.6875, + "completions/mean_terminated_length": 126.6875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.34527677297592163, + "epoch": 1.6691176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08364935046106986, + "kl": 0.06286908686161041, + "learning_rate": 4.910957296417467e-07, + "loss": 0.0006, + "num_tokens": 42903158.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4781303405761719, + "sampling/importance_sampling_ratio/mean": 1.0003399848937988, + "sampling/importance_sampling_ratio/min": 0.5820320248603821, + "sampling/sampling_logp_difference/max": 0.5412298440933228, + "sampling/sampling_logp_difference/mean": 0.016989272087812424, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 200.21875, + "completions/mean_terminated_length": 200.21875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.396662175655365, + "epoch": 1.670343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8406850862689759, + "kl": 0.05830325931310654, + "learning_rate": 4.903834726061564e-07, + "loss": 0.0501, + "num_tokens": 42937284.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3477368354797363, + "sampling/importance_sampling_ratio/mean": 1.0003334283828735, + "sampling/importance_sampling_ratio/min": 0.6876862645149231, + "sampling/sampling_logp_difference/max": 0.374422550201416, + "sampling/sampling_logp_difference/mean": 0.015416143462061882, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 187.96875, + "completions/mean_terminated_length": 187.96875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.36734241247177124, + "epoch": 1.6715686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03472094724967518, + "kl": 0.03748122230172157, + "learning_rate": 4.896712350915074e-07, + "loss": 0.0004, + "num_tokens": 42975026.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8012819290161133, + "sampling/importance_sampling_ratio/mean": 1.0005528926849365, + "sampling/importance_sampling_ratio/min": 0.6894164085388184, + "sampling/sampling_logp_difference/max": 0.588498592376709, + "sampling/sampling_logp_difference/mean": 0.014278611168265343, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 172.921875, + "completions/mean_terminated_length": 172.921875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.27913814783096313, + "epoch": 1.6727941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7883877875965949, + "kl": 0.040285248309373856, + "learning_rate": 4.889590185435969e-07, + "loss": -0.0038, + "num_tokens": 43005197.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.48959219455719, + "sampling/importance_sampling_ratio/mean": 0.9998310208320618, + "sampling/importance_sampling_ratio/min": 0.6181638836860657, + "sampling/sampling_logp_difference/max": 0.481001615524292, + "sampling/sampling_logp_difference/mean": 0.012834897264838219, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 173.5625, + "completions/mean_terminated_length": 173.5625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.5182405710220337, + "epoch": 1.6740196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1479215998376797, + "kl": 0.06698828935623169, + "learning_rate": 4.882468244081792e-07, + "loss": 0.0193, + "num_tokens": 43039953.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.487452745437622, + "sampling/importance_sampling_ratio/mean": 0.9995392560958862, + "sampling/importance_sampling_ratio/min": 0.6213628053665161, + "sampling/sampling_logp_difference/max": 0.47584009170532227, + "sampling/sampling_logp_difference/mean": 0.01910814270377159, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 210.171875, + "completions/mean_terminated_length": 210.171875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.2507001459598541, + "epoch": 1.6752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02484434768268435, + "kl": 0.025661654770374298, + "learning_rate": 4.875346541309636e-07, + "loss": 0.0002, + "num_tokens": 43071852.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5987184047698975, + "sampling/importance_sampling_ratio/mean": 1.000409483909607, + "sampling/importance_sampling_ratio/min": 0.6142013669013977, + "sampling/sampling_logp_difference/max": 0.48743247985839844, + "sampling/sampling_logp_difference/mean": 0.011608692817389965, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 205.109375, + "completions/mean_terminated_length": 205.109375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.36860013008117676, + "epoch": 1.6764705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.942880839334349, + "kl": 0.054536305367946625, + "learning_rate": 4.868225091576102e-07, + "loss": -0.0101, + "num_tokens": 43103187.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.3728216886520386, + "sampling/importance_sampling_ratio/mean": 1.0003598928451538, + "sampling/importance_sampling_ratio/min": 0.6970885992050171, + "sampling/sampling_logp_difference/max": 0.3608427047729492, + "sampling/sampling_logp_difference/mean": 0.01367130409926176, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 146.859375, + "completions/mean_terminated_length": 146.859375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3172333240509033, + "epoch": 1.6776960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.067249806327941, + "kl": 0.0470583513379097, + "learning_rate": 4.861103909337285e-07, + "loss": 0.0005, + "num_tokens": 43131114.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.491654872894287, + "sampling/importance_sampling_ratio/mean": 0.9995778799057007, + "sampling/importance_sampling_ratio/min": 0.48319175839424133, + "sampling/sampling_logp_difference/max": 0.7273416519165039, + "sampling/sampling_logp_difference/mean": 0.015247553586959839, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 148.46875, + "completions/mean_terminated_length": 148.46875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3860611021518707, + "epoch": 1.678921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0156624165494141, + "kl": 0.04295976832509041, + "learning_rate": 4.853983009048732e-07, + "loss": 0.0092, + "num_tokens": 43161064.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9391186237335205, + "sampling/importance_sampling_ratio/mean": 1.0002822875976562, + "sampling/importance_sampling_ratio/min": 0.6218687891960144, + "sampling/sampling_logp_difference/max": 0.6622335910797119, + "sampling/sampling_logp_difference/mean": 0.016975287348031998, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 162.328125, + "completions/mean_terminated_length": 162.328125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.45282042026519775, + "epoch": 1.6801470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6421957872690638, + "kl": 0.06159253418445587, + "learning_rate": 4.84686240516542e-07, + "loss": 0.0161, + "num_tokens": 43188173.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6155743598937988, + "sampling/importance_sampling_ratio/mean": 0.9999287128448486, + "sampling/importance_sampling_ratio/min": 0.7227627038955688, + "sampling/sampling_logp_difference/max": 0.4796905517578125, + "sampling/sampling_logp_difference/mean": 0.01766914874315262, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 170.671875, + "completions/mean_terminated_length": 170.671875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.28264009952545166, + "epoch": 1.6813725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033847431817872406, + "kl": 0.026995070278644562, + "learning_rate": 4.839742112141724e-07, + "loss": 0.0003, + "num_tokens": 43216984.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6612284183502197, + "sampling/importance_sampling_ratio/mean": 1.0005111694335938, + "sampling/importance_sampling_ratio/min": 0.7724306583404541, + "sampling/sampling_logp_difference/max": 0.5075573921203613, + "sampling/sampling_logp_difference/mean": 0.012486159801483154, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 153.453125, + "completions/mean_terminated_length": 153.453125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3153635859489441, + "epoch": 1.6825980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035374205418365584, + "kl": 0.02962607890367508, + "learning_rate": 4.832622144431388e-07, + "loss": 0.0003, + "num_tokens": 43245573.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5984976291656494, + "sampling/importance_sampling_ratio/mean": 0.9990425109863281, + "sampling/importance_sampling_ratio/min": 0.6164947152137756, + "sampling/sampling_logp_difference/max": 0.4837055206298828, + "sampling/sampling_logp_difference/mean": 0.014347722753882408, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 192.71875, + "completions/mean_terminated_length": 192.71875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.40542715787887573, + "epoch": 1.6838235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02828922354908403, + "kl": 0.03110560029745102, + "learning_rate": 4.825502516487496e-07, + "loss": 0.0003, + "num_tokens": 43277875.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4067785739898682, + "sampling/importance_sampling_ratio/mean": 0.9995977878570557, + "sampling/importance_sampling_ratio/min": 0.6156638860702515, + "sampling/sampling_logp_difference/max": 0.48505401611328125, + "sampling/sampling_logp_difference/mean": 0.01650063879787922, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 198.046875, + "completions/mean_terminated_length": 198.046875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.425992488861084, + "epoch": 1.6850490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03407945002471821, + "kl": 0.043031804263591766, + "learning_rate": 4.818383242762439e-07, + "loss": 0.0004, + "num_tokens": 43315366.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6155577898025513, + "sampling/importance_sampling_ratio/mean": 1.0000079870224, + "sampling/importance_sampling_ratio/min": 0.6148726940155029, + "sampling/sampling_logp_difference/max": 0.4863399863243103, + "sampling/sampling_logp_difference/mean": 0.01564926654100418, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 202.140625, + "completions/mean_terminated_length": 202.140625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.37396860122680664, + "epoch": 1.6862745098039216, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2579806973084713, + "kl": 0.03807292878627777, + "learning_rate": 4.811264337707894e-07, + "loss": -0.0198, + "num_tokens": 43343487.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7321727275848389, + "sampling/importance_sampling_ratio/mean": 0.9996993541717529, + "sampling/importance_sampling_ratio/min": 0.6481713652610779, + "sampling/sampling_logp_difference/max": 0.5493764877319336, + "sampling/sampling_logp_difference/mean": 0.013977523893117905, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 163.796875, + "completions/mean_terminated_length": 163.796875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3286457061767578, + "epoch": 1.6875, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028985537231318456, + "kl": 0.03047393634915352, + "learning_rate": 4.804145815774786e-07, + "loss": 0.0003, + "num_tokens": 43373970.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4068397283554077, + "sampling/importance_sampling_ratio/mean": 1.0006426572799683, + "sampling/importance_sampling_ratio/min": 0.6209167838096619, + "sampling/sampling_logp_difference/max": 0.47655820846557617, + "sampling/sampling_logp_difference/mean": 0.014818085357546806, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 212.890625, + "completions/mean_terminated_length": 212.890625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.29821670055389404, + "epoch": 1.6887254901960784, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03384399312511881, + "kl": 0.027891390025615692, + "learning_rate": 4.797027691413267e-07, + "loss": 0.0003, + "num_tokens": 43402059.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.459165334701538, + "sampling/importance_sampling_ratio/mean": 1.000119924545288, + "sampling/importance_sampling_ratio/min": 0.624224841594696, + "sampling/sampling_logp_difference/max": 0.4712446928024292, + "sampling/sampling_logp_difference/mean": 0.01404886320233345, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 210.0625, + "completions/mean_terminated_length": 210.0625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.36395466327667236, + "epoch": 1.6899509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03126411478502101, + "kl": 0.03434750437736511, + "learning_rate": 4.789909979072673e-07, + "loss": 0.0003, + "num_tokens": 43436975.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.693570852279663, + "sampling/importance_sampling_ratio/mean": 1.0003466606140137, + "sampling/importance_sampling_ratio/min": 0.643545389175415, + "sampling/sampling_logp_difference/max": 0.5268392562866211, + "sampling/sampling_logp_difference/mean": 0.014872172847390175, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 176.59375, + "completions/mean_terminated_length": 176.59375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3995271325111389, + "epoch": 1.6911764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02820292967795783, + "kl": 0.03533459082245827, + "learning_rate": 4.782792693201513e-07, + "loss": 0.0003, + "num_tokens": 43465701.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.663543939590454, + "sampling/importance_sampling_ratio/mean": 0.9997713565826416, + "sampling/importance_sampling_ratio/min": 0.5771487951278687, + "sampling/sampling_logp_difference/max": 0.5496551990509033, + "sampling/sampling_logp_difference/mean": 0.016274357214570045, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 175.9375, + "completions/mean_terminated_length": 175.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3463321328163147, + "epoch": 1.6924019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026718688673992796, + "kl": 0.03018825501203537, + "learning_rate": 4.775675848247427e-07, + "loss": 0.0003, + "num_tokens": 43495425.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3740402460098267, + "sampling/importance_sampling_ratio/mean": 1.0006554126739502, + "sampling/importance_sampling_ratio/min": 0.6716427206993103, + "sampling/sampling_logp_difference/max": 0.39802873134613037, + "sampling/sampling_logp_difference/mean": 0.014302469789981842, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 168.390625, + "completions/mean_terminated_length": 168.390625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.33125466108322144, + "epoch": 1.6936274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0846404275939687, + "kl": 0.048324376344680786, + "learning_rate": 4.768559458657155e-07, + "loss": -0.0115, + "num_tokens": 43521210.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.7379939556121826, + "sampling/importance_sampling_ratio/mean": 0.9997254610061646, + "sampling/importance_sampling_ratio/min": 0.6437204480171204, + "sampling/sampling_logp_difference/max": 0.5527315139770508, + "sampling/sampling_logp_difference/mean": 0.015096197836101055, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.33917608857154846, + "epoch": 1.6948529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0286757095713985, + "kl": 0.04030965641140938, + "learning_rate": 4.7614435388765203e-07, + "loss": 0.0118, + "num_tokens": 43558202.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4172824621200562, + "sampling/importance_sampling_ratio/mean": 1.000497817993164, + "sampling/importance_sampling_ratio/min": 0.6317176818847656, + "sampling/sampling_logp_difference/max": 0.45931267738342285, + "sampling/sampling_logp_difference/mean": 0.014552392065525055, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 160.6875, + "completions/mean_terminated_length": 160.6875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3247225880622864, + "epoch": 1.696078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03208284249245446, + "kl": 0.03661260008811951, + "learning_rate": 4.7543281033503885e-07, + "loss": 0.0003, + "num_tokens": 43587446.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5082286596298218, + "sampling/importance_sampling_ratio/mean": 0.9997931122779846, + "sampling/importance_sampling_ratio/min": 0.6297659277915955, + "sampling/sampling_logp_difference/max": 0.46240711212158203, + "sampling/sampling_logp_difference/mean": 0.015131104737520218, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 229.3125, + "completions/mean_terminated_length": 229.3125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3170344829559326, + "epoch": 1.6973039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02952600609357999, + "kl": 0.02648162469267845, + "learning_rate": 4.747213166522644e-07, + "loss": 0.0003, + "num_tokens": 43620186.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4534229040145874, + "sampling/importance_sampling_ratio/mean": 1.0002045631408691, + "sampling/importance_sampling_ratio/min": 0.5138672590255737, + "sampling/sampling_logp_difference/max": 0.665790319442749, + "sampling/sampling_logp_difference/mean": 0.013356797397136688, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 174.703125, + "completions/mean_terminated_length": 174.703125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.34353625774383545, + "epoch": 1.6985294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030580744517143644, + "kl": 0.029760949313640594, + "learning_rate": 4.740098742836156e-07, + "loss": 0.0003, + "num_tokens": 43645447.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3198062181472778, + "sampling/importance_sampling_ratio/mean": 1.0000088214874268, + "sampling/importance_sampling_ratio/min": 0.6282213926315308, + "sampling/sampling_logp_difference/max": 0.464862585067749, + "sampling/sampling_logp_difference/mean": 0.014398678205907345, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 211.875, + "completions/mean_terminated_length": 211.875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.37205174565315247, + "epoch": 1.6997549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7543692809234275, + "kl": 0.042431462556123734, + "learning_rate": 4.732984846732755e-07, + "loss": -0.0045, + "num_tokens": 43677231.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5407931804656982, + "sampling/importance_sampling_ratio/mean": 1.000165581703186, + "sampling/importance_sampling_ratio/min": 0.627170741558075, + "sampling/sampling_logp_difference/max": 0.4665365219116211, + "sampling/sampling_logp_difference/mean": 0.015096995048224926, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 205.28125, + "completions/mean_terminated_length": 205.28125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.2494359165430069, + "epoch": 1.7009803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017768629845976415, + "kl": 0.020800625905394554, + "learning_rate": 4.725871492653199e-07, + "loss": 0.0002, + "num_tokens": 43708065.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4053384065628052, + "sampling/importance_sampling_ratio/mean": 1.0005443096160889, + "sampling/importance_sampling_ratio/min": 0.4785323441028595, + "sampling/sampling_logp_difference/max": 0.7370314598083496, + "sampling/sampling_logp_difference/mean": 0.011222817935049534, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 182.171875, + "completions/mean_terminated_length": 182.171875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.30386781692504883, + "epoch": 1.7022058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031091348803848076, + "kl": 0.02845875173807144, + "learning_rate": 4.718758695037149e-07, + "loss": 0.0003, + "num_tokens": 43735580.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.575518012046814, + "sampling/importance_sampling_ratio/mean": 1.000417947769165, + "sampling/importance_sampling_ratio/min": 0.717453122138977, + "sampling/sampling_logp_difference/max": 0.45458412170410156, + "sampling/sampling_logp_difference/mean": 0.013841615989804268, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 215.328125, + "completions/mean_terminated_length": 215.328125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3812558054924011, + "epoch": 1.7034313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9062121487325195, + "kl": 0.03396788612008095, + "learning_rate": 4.7116464683231285e-07, + "loss": -0.0197, + "num_tokens": 43772001.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6544450521469116, + "sampling/importance_sampling_ratio/mean": 1.0001673698425293, + "sampling/importance_sampling_ratio/min": 0.705818772315979, + "sampling/sampling_logp_difference/max": 0.5034656524658203, + "sampling/sampling_logp_difference/mean": 0.015301933512091637, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 208.28125, + "completions/mean_terminated_length": 208.28125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.32804757356643677, + "epoch": 1.704656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017757286513298963, + "kl": 0.025670060887932777, + "learning_rate": 4.704534826948509e-07, + "loss": 0.0002, + "num_tokens": 43806579.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998500347137451, + "sampling/importance_sampling_ratio/min": 0.620639443397522, + "sampling/sampling_logp_difference/max": 0.809590220451355, + "sampling/sampling_logp_difference/mean": 0.01399231143295765, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 176.328125, + "completions/mean_terminated_length": 176.328125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.27521198987960815, + "epoch": 1.7058823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02977600678266604, + "kl": 0.03567678481340408, + "learning_rate": 4.6974237853494744e-07, + "loss": 0.0003, + "num_tokens": 43836936.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3147550821304321, + "sampling/importance_sampling_ratio/mean": 0.9992107152938843, + "sampling/importance_sampling_ratio/min": 0.6593252420425415, + "sampling/sampling_logp_difference/max": 0.4165383577346802, + "sampling/sampling_logp_difference/mean": 0.013566594570875168, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 206.359375, + "completions/mean_terminated_length": 206.359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3251534700393677, + "epoch": 1.7071078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020232729503715307, + "kl": 0.02814178541302681, + "learning_rate": 4.690313357960985e-07, + "loss": 0.0003, + "num_tokens": 43872319.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.581284761428833, + "sampling/importance_sampling_ratio/mean": 0.999963104724884, + "sampling/importance_sampling_ratio/min": 0.6158772706985474, + "sampling/sampling_logp_difference/max": 0.4847075939178467, + "sampling/sampling_logp_difference/mean": 0.0141825620085001, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 184.140625, + "completions/mean_terminated_length": 184.140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.41460761427879333, + "epoch": 1.7083333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022121295935993986, + "kl": 0.028628060594201088, + "learning_rate": 4.68320355921676e-07, + "loss": 0.0003, + "num_tokens": 43901976.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4436148405075073, + "sampling/importance_sampling_ratio/mean": 0.9999883770942688, + "sampling/importance_sampling_ratio/min": 0.6721982359886169, + "sampling/sampling_logp_difference/max": 0.3972020149230957, + "sampling/sampling_logp_difference/mean": 0.01625676453113556, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 256.0625, + "completions/mean_terminated_length": 256.0625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3149504065513611, + "epoch": 1.7095588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023421245005584058, + "kl": 0.031986355781555176, + "learning_rate": 4.67609440354924e-07, + "loss": 0.0003, + "num_tokens": 43939308.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5744267702102661, + "sampling/importance_sampling_ratio/mean": 0.9998037815093994, + "sampling/importance_sampling_ratio/min": 0.3797297179698944, + "sampling/sampling_logp_difference/max": 0.9682955741882324, + "sampling/sampling_logp_difference/mean": 0.013182253576815128, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 246.578125, + "completions/mean_terminated_length": 246.578125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.48535847663879395, + "epoch": 1.7107843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9976000775411583, + "kl": 0.0374327227473259, + "learning_rate": 4.668985905389563e-07, + "loss": -0.0363, + "num_tokens": 43977809.0, + "reward": 0.53125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4041378498077393, + "sampling/importance_sampling_ratio/mean": 1.0008046627044678, + "sampling/importance_sampling_ratio/min": 0.6904107332229614, + "sampling/sampling_logp_difference/max": 0.3704686164855957, + "sampling/sampling_logp_difference/mean": 0.01654989831149578, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 224.859375, + "completions/mean_terminated_length": 224.859375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.38829389214515686, + "epoch": 1.7120098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02345620649634188, + "kl": 0.02460196614265442, + "learning_rate": 4.661878079167526e-07, + "loss": 0.0002, + "num_tokens": 44015032.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6182984113693237, + "sampling/importance_sampling_ratio/mean": 0.9992150068283081, + "sampling/importance_sampling_ratio/min": 0.42074137926101685, + "sampling/sampling_logp_difference/max": 0.8657369613647461, + "sampling/sampling_logp_difference/mean": 0.015943851321935654, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 172.375, + "completions/mean_terminated_length": 172.375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2884783148765564, + "epoch": 1.7132352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024885221888654482, + "kl": 0.024858683347702026, + "learning_rate": 4.6547709393115677e-07, + "loss": 0.0003, + "num_tokens": 44041216.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3691984415054321, + "sampling/importance_sampling_ratio/mean": 1.000344157218933, + "sampling/importance_sampling_ratio/min": 0.6060724258422852, + "sampling/sampling_logp_difference/max": 0.500755786895752, + "sampling/sampling_logp_difference/mean": 0.013669838197529316, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 206.5625, + "completions/mean_terminated_length": 206.5625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3687763512134552, + "epoch": 1.7144607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016980934105717046, + "kl": 0.021134931594133377, + "learning_rate": 4.6476645002487295e-07, + "loss": 0.0002, + "num_tokens": 44076788.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.511935830116272, + "sampling/importance_sampling_ratio/mean": 1.0003235340118408, + "sampling/importance_sampling_ratio/min": 0.6470901966094971, + "sampling/sampling_logp_difference/max": 0.4352695941925049, + "sampling/sampling_logp_difference/mean": 0.014946680516004562, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 212.703125, + "completions/mean_terminated_length": 212.703125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.3485764265060425, + "epoch": 1.715686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016084272629488215, + "kl": 0.022283107042312622, + "learning_rate": 4.640558776404639e-07, + "loss": 0.0002, + "num_tokens": 44113233.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.633171558380127, + "sampling/importance_sampling_ratio/mean": 0.9996302127838135, + "sampling/importance_sampling_ratio/min": 0.5327483415603638, + "sampling/sampling_logp_difference/max": 0.6297061443328857, + "sampling/sampling_logp_difference/mean": 0.01467338390648365, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 192.1875, + "completions/mean_terminated_length": 192.1875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.3813598155975342, + "epoch": 1.7169117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02383066227556483, + "kl": 0.030649978667497635, + "learning_rate": 4.633453782203458e-07, + "loss": 0.0003, + "num_tokens": 44140093.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 1.0003618001937866, + "sampling/importance_sampling_ratio/min": 0.6177483201026917, + "sampling/sampling_logp_difference/max": 0.4816741943359375, + "sampling/sampling_logp_difference/mean": 0.015897084027528763, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 211.15625, + "completions/mean_terminated_length": 211.15625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4610140323638916, + "epoch": 1.718137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.086954599687401, + "kl": 0.05906593054533005, + "learning_rate": 4.626349532067879e-07, + "loss": -0.0141, + "num_tokens": 44171975.0, + "reward": 0.65625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5080199241638184, + "sampling/importance_sampling_ratio/mean": 1.0002214908599854, + "sampling/importance_sampling_ratio/min": 0.7183539867401123, + "sampling/sampling_logp_difference/max": 0.41079747676849365, + "sampling/sampling_logp_difference/mean": 0.017302053049206734, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 162.765625, + "completions/mean_terminated_length": 162.765625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.25882458686828613, + "epoch": 1.719362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025924261751398732, + "kl": 0.027556277811527252, + "learning_rate": 4.6192460404190793e-07, + "loss": 0.0003, + "num_tokens": 44199656.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6110548973083496, + "sampling/importance_sampling_ratio/mean": 0.9999560117721558, + "sampling/importance_sampling_ratio/min": 0.631548285484314, + "sampling/sampling_logp_difference/max": 0.47688913345336914, + "sampling/sampling_logp_difference/mean": 0.012271011248230934, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 217.8125, + "completions/mean_terminated_length": 217.8125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.39787495136260986, + "epoch": 1.7205882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7389451406692283, + "kl": 0.04811471700668335, + "learning_rate": 4.6121433216766935e-07, + "loss": 0.0022, + "num_tokens": 44232796.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4469801187515259, + "sampling/importance_sampling_ratio/mean": 0.9995118379592896, + "sampling/importance_sampling_ratio/min": 0.6057416200637817, + "sampling/sampling_logp_difference/max": 0.5013017654418945, + "sampling/sampling_logp_difference/mean": 0.016226064413785934, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 237.046875, + "completions/mean_terminated_length": 237.046875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3010721802711487, + "epoch": 1.721813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012379549934276401, + "kl": 0.018051059916615486, + "learning_rate": 4.605041390258794e-07, + "loss": 0.0002, + "num_tokens": 44265871.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4354110956192017, + "sampling/importance_sampling_ratio/mean": 1.0000429153442383, + "sampling/importance_sampling_ratio/min": 0.6600005030632019, + "sampling/sampling_logp_difference/max": 0.4155147075653076, + "sampling/sampling_logp_difference/mean": 0.012983039021492004, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 224.84375, + "completions/mean_terminated_length": 224.84375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3763827681541443, + "epoch": 1.7230392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8051493766718345, + "kl": 0.03543316572904587, + "learning_rate": 4.5979402605818514e-07, + "loss": 0.0311, + "num_tokens": 44299413.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5788308382034302, + "sampling/importance_sampling_ratio/mean": 0.9999749660491943, + "sampling/importance_sampling_ratio/min": 0.6668434143066406, + "sampling/sampling_logp_difference/max": 0.45668458938598633, + "sampling/sampling_logp_difference/mean": 0.014188934117555618, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 181.046875, + "completions/mean_terminated_length": 181.046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3947075307369232, + "epoch": 1.7242647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0450270633718066, + "kl": 0.03533687815070152, + "learning_rate": 4.5908399470607104e-07, + "loss": 0.0003, + "num_tokens": 44327464.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.600734829902649, + "sampling/importance_sampling_ratio/mean": 0.9991427063941956, + "sampling/importance_sampling_ratio/min": 0.5616891384124756, + "sampling/sampling_logp_difference/max": 0.5768067836761475, + "sampling/sampling_logp_difference/mean": 0.016672521829605103, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 239.109375, + "completions/mean_terminated_length": 239.109375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5134605765342712, + "epoch": 1.7254901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7564047078937739, + "kl": 0.03194734454154968, + "learning_rate": 4.5837404641085535e-07, + "loss": 0.0078, + "num_tokens": 44369247.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5701671838760376, + "sampling/importance_sampling_ratio/mean": 0.9999627470970154, + "sampling/importance_sampling_ratio/min": 0.6398777961730957, + "sampling/sampling_logp_difference/max": 0.45118212699890137, + "sampling/sampling_logp_difference/mean": 0.01758524775505066, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 211.09375, + "completions/mean_terminated_length": 211.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.30869802832603455, + "epoch": 1.7267156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013130937363080112, + "kl": 0.019469216465950012, + "learning_rate": 4.576641826136884e-07, + "loss": 0.0002, + "num_tokens": 44401269.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4367069005966187, + "sampling/importance_sampling_ratio/mean": 1.0001226663589478, + "sampling/importance_sampling_ratio/min": 0.512790858745575, + "sampling/sampling_logp_difference/max": 0.6678872108459473, + "sampling/sampling_logp_difference/mean": 0.012972285971045494, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 178.90625, + "completions/mean_terminated_length": 178.90625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.37025976181030273, + "epoch": 1.7279411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9805768712694561, + "kl": 0.0347997285425663, + "learning_rate": 4.5695440475554864e-07, + "loss": -0.0126, + "num_tokens": 44429775.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0007288455963135, + "sampling/importance_sampling_ratio/min": 0.637222945690155, + "sampling/sampling_logp_difference/max": 0.4506356716156006, + "sampling/sampling_logp_difference/mean": 0.01550462655723095, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 237.78125, + "completions/mean_terminated_length": 237.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.43542617559432983, + "epoch": 1.7291666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8403159944524539, + "kl": 0.04189896583557129, + "learning_rate": 4.5624471427724036e-07, + "loss": 0.001, + "num_tokens": 44458977.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.428770661354065, + "sampling/importance_sampling_ratio/mean": 0.9994067549705505, + "sampling/importance_sampling_ratio/min": 0.4924968481063843, + "sampling/sampling_logp_difference/max": 0.7082672119140625, + "sampling/sampling_logp_difference/mean": 0.016688797622919083, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 210.328125, + "completions/mean_terminated_length": 210.328125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3765452802181244, + "epoch": 1.7303921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020520693560359245, + "kl": 0.029866300523281097, + "learning_rate": 4.5553511261939e-07, + "loss": 0.0003, + "num_tokens": 44491766.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5136628150939941, + "sampling/importance_sampling_ratio/mean": 0.999915599822998, + "sampling/importance_sampling_ratio/min": 0.6295127868652344, + "sampling/sampling_logp_difference/max": 0.46280908584594727, + "sampling/sampling_logp_difference/mean": 0.014515403658151627, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 197.859375, + "completions/mean_terminated_length": 197.859375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3265450596809387, + "epoch": 1.7316176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02309653591519408, + "kl": 0.030018430203199387, + "learning_rate": 4.5482560122244407e-07, + "loss": 0.0003, + "num_tokens": 44518845.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.600726842880249, + "sampling/importance_sampling_ratio/mean": 1.0002501010894775, + "sampling/importance_sampling_ratio/min": 0.6692793965339661, + "sampling/sampling_logp_difference/max": 0.4704577922821045, + "sampling/sampling_logp_difference/mean": 0.014397569000720978, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 280.9375, + "completions/mean_terminated_length": 280.9375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3380570113658905, + "epoch": 1.732843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6330278044158965, + "kl": 0.022636758163571358, + "learning_rate": 4.541161815266658e-07, + "loss": 0.023, + "num_tokens": 44554841.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.875036358833313, + "sampling/importance_sampling_ratio/mean": 0.9998974204063416, + "sampling/importance_sampling_ratio/min": 0.6907591223716736, + "sampling/sampling_logp_difference/max": 0.6286280155181885, + "sampling/sampling_logp_difference/mean": 0.0126770855858922, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 192.625, + "completions/mean_terminated_length": 192.625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.36764997243881226, + "epoch": 1.7340686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0385506943948144, + "kl": 0.02965870499610901, + "learning_rate": 4.534068549721324e-07, + "loss": -0.0239, + "num_tokens": 44582017.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.388352870941162, + "sampling/importance_sampling_ratio/mean": 1.0002903938293457, + "sampling/importance_sampling_ratio/min": 0.6088035106658936, + "sampling/sampling_logp_difference/max": 0.4962596893310547, + "sampling/sampling_logp_difference/mean": 0.0152654517441988, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 211.328125, + "completions/mean_terminated_length": 211.328125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.474894642829895, + "epoch": 1.7352941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0246964759475967, + "kl": 0.03367047384381294, + "learning_rate": 4.5269762299873144e-07, + "loss": -0.0211, + "num_tokens": 44616950.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.396409034729004, + "sampling/importance_sampling_ratio/mean": 0.9997467398643494, + "sampling/importance_sampling_ratio/min": 0.6573418378829956, + "sampling/sampling_logp_difference/max": 0.41955113410949707, + "sampling/sampling_logp_difference/mean": 0.017158398404717445, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 232.859375, + "completions/mean_terminated_length": 232.859375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.42667698860168457, + "epoch": 1.7365196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6762500874465751, + "kl": 0.04127083718776703, + "learning_rate": 4.519884870461591e-07, + "loss": -0.0073, + "num_tokens": 44651453.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.7663496732711792, + "sampling/importance_sampling_ratio/mean": 0.9996098279953003, + "sampling/importance_sampling_ratio/min": 0.6298543214797974, + "sampling/sampling_logp_difference/max": 0.5689151287078857, + "sampling/sampling_logp_difference/mean": 0.016091158613562584, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 166.46875, + "completions/mean_terminated_length": 166.46875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3405911922454834, + "epoch": 1.7377450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.123686574209456, + "kl": 0.04614399001002312, + "learning_rate": 4.512794485539165e-07, + "loss": -0.024, + "num_tokens": 44675787.0, + "reward": 0.625, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.381252408027649, + "sampling/importance_sampling_ratio/mean": 0.999666690826416, + "sampling/importance_sampling_ratio/min": 0.6393997073173523, + "sampling/sampling_logp_difference/max": 0.44722557067871094, + "sampling/sampling_logp_difference/mean": 0.01435903925448656, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 201.390625, + "completions/mean_terminated_length": 201.390625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3742409348487854, + "epoch": 1.7389705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7028407306720601, + "kl": 0.02892223931849003, + "learning_rate": 4.505705089613068e-07, + "loss": -0.0251, + "num_tokens": 44705412.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5027868747711182, + "sampling/importance_sampling_ratio/mean": 1.0004254579544067, + "sampling/importance_sampling_ratio/min": 0.6805267333984375, + "sampling/sampling_logp_difference/max": 0.40732133388519287, + "sampling/sampling_logp_difference/mean": 0.014049705117940903, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 177.609375, + "completions/mean_terminated_length": 177.609375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.39864909648895264, + "epoch": 1.7401960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.017394567395279, + "kl": 0.0507853738963604, + "learning_rate": 4.4986166970743233e-07, + "loss": 0.0066, + "num_tokens": 44730987.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5071645975112915, + "sampling/importance_sampling_ratio/mean": 1.0001134872436523, + "sampling/importance_sampling_ratio/min": 0.6924469470977783, + "sampling/sampling_logp_difference/max": 0.4102301597595215, + "sampling/sampling_logp_difference/mean": 0.015454644337296486, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 195.578125, + "completions/mean_terminated_length": 195.578125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3866695463657379, + "epoch": 1.741421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8594666401532325, + "kl": 0.029470369219779968, + "learning_rate": 4.4915293223119205e-07, + "loss": 0.0295, + "num_tokens": 44759536.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4187947511672974, + "sampling/importance_sampling_ratio/mean": 1.0001866817474365, + "sampling/importance_sampling_ratio/min": 0.7082564830780029, + "sampling/sampling_logp_difference/max": 0.3498077392578125, + "sampling/sampling_logp_difference/mean": 0.015881307423114777, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 219.546875, + "completions/mean_terminated_length": 219.546875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.5283569097518921, + "epoch": 1.7426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8813196512801895, + "kl": 0.04730577394366264, + "learning_rate": 4.484442979712783e-07, + "loss": -0.0017, + "num_tokens": 44796531.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4558353424072266, + "sampling/importance_sampling_ratio/mean": 1.0001245737075806, + "sampling/importance_sampling_ratio/min": 0.6180657744407654, + "sampling/sampling_logp_difference/max": 0.48116040229797363, + "sampling/sampling_logp_difference/mean": 0.01736762933433056, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 152.03125, + "completions/mean_terminated_length": 152.03125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3932775855064392, + "epoch": 1.7438725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9309149058833824, + "kl": 0.06567516922950745, + "learning_rate": 4.477357683661733e-07, + "loss": -0.0048, + "num_tokens": 44820533.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5347106456756592, + "sampling/importance_sampling_ratio/mean": 1.000542402267456, + "sampling/importance_sampling_ratio/min": 0.4917859435081482, + "sampling/sampling_logp_difference/max": 0.7097117900848389, + "sampling/sampling_logp_difference/mean": 0.015332138165831566, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 211.03125, + "completions/mean_terminated_length": 211.03125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.4433809518814087, + "epoch": 1.7450980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7713792047364969, + "kl": 0.05297395586967468, + "learning_rate": 4.470273448541475e-07, + "loss": 0.0, + "num_tokens": 44848423.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.9679169654846191, + "sampling/importance_sampling_ratio/mean": 0.9995995759963989, + "sampling/importance_sampling_ratio/min": 0.6510018110275269, + "sampling/sampling_logp_difference/max": 0.6769756078720093, + "sampling/sampling_logp_difference/mean": 0.01568237505853176, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 230.109375, + "completions/mean_terminated_length": 230.109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.44413939118385315, + "epoch": 1.7463235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.865555639371449, + "kl": 0.04160609841346741, + "learning_rate": 4.4631902887325567e-07, + "loss": 0.0115, + "num_tokens": 44885422.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6075966358184814, + "sampling/importance_sampling_ratio/mean": 0.9996904134750366, + "sampling/importance_sampling_ratio/min": 0.6300875544548035, + "sampling/sampling_logp_difference/max": 0.47474026679992676, + "sampling/sampling_logp_difference/mean": 0.015817858278751373, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 199.265625, + "completions/mean_terminated_length": 199.265625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3973761796951294, + "epoch": 1.7475490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.882389261024687, + "kl": 0.06518774479627609, + "learning_rate": 4.4561082186133456e-07, + "loss": -0.0007, + "num_tokens": 44910751.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4429669380187988, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.7577892541885376, + "sampling/sampling_logp_difference/max": 0.3667013645172119, + "sampling/sampling_logp_difference/mean": 0.014969083480536938, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 189.375, + "completions/mean_terminated_length": 189.375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4147040545940399, + "epoch": 1.7487745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9431794753665337, + "kl": 0.026446394622325897, + "learning_rate": 4.4490272525599936e-07, + "loss": 0.0189, + "num_tokens": 44943079.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5475490093231201, + "sampling/importance_sampling_ratio/mean": 1.0002816915512085, + "sampling/importance_sampling_ratio/min": 0.6187689304351807, + "sampling/sampling_logp_difference/max": 0.4800233840942383, + "sampling/sampling_logp_difference/mean": 0.01538955420255661, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 183.9375, + "completions/mean_terminated_length": 183.9375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4222148060798645, + "epoch": 1.75, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9239683154757388, + "kl": 0.0388445146381855, + "learning_rate": 4.4419474049464135e-07, + "loss": 0.0091, + "num_tokens": 44969763.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.291072130203247, + "sampling/importance_sampling_ratio/mean": 1.0004539489746094, + "sampling/importance_sampling_ratio/min": 0.7533888816833496, + "sampling/sampling_logp_difference/max": 0.2831737995147705, + "sampling/sampling_logp_difference/mean": 0.014346310868859291, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 213.484375, + "completions/mean_terminated_length": 213.484375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.37551143765449524, + "epoch": 1.7512254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7062470331062345, + "kl": 0.04760557413101196, + "learning_rate": 4.43486869014425e-07, + "loss": 0.0071, + "num_tokens": 45004482.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4682234525680542, + "sampling/importance_sampling_ratio/mean": 0.999937891960144, + "sampling/importance_sampling_ratio/min": 0.6740283370018005, + "sampling/sampling_logp_difference/max": 0.39448320865631104, + "sampling/sampling_logp_difference/mean": 0.0134231336414814, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 238.234375, + "completions/mean_terminated_length": 238.234375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.38191598653793335, + "epoch": 1.7524509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02758437666281902, + "kl": 0.034000493586063385, + "learning_rate": 4.427791122522841e-07, + "loss": 0.0003, + "num_tokens": 45046049.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4964890480041504, + "sampling/importance_sampling_ratio/mean": 1.0001420974731445, + "sampling/importance_sampling_ratio/min": 0.6302828788757324, + "sampling/sampling_logp_difference/max": 0.46158647537231445, + "sampling/sampling_logp_difference/mean": 0.014439761638641357, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 235.859375, + "completions/mean_terminated_length": 235.859375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3631054162979126, + "epoch": 1.7536764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7424446221644718, + "kl": 0.039311766624450684, + "learning_rate": 4.420714716449203e-07, + "loss": -0.0052, + "num_tokens": 45078072.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4538235664367676, + "sampling/importance_sampling_ratio/mean": 0.9998072385787964, + "sampling/importance_sampling_ratio/min": 0.6587311625480652, + "sampling/sampling_logp_difference/max": 0.4174398183822632, + "sampling/sampling_logp_difference/mean": 0.01360815018415451, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 223.140625, + "completions/mean_terminated_length": 223.140625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3967345952987671, + "epoch": 1.7549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5323812009704646, + "kl": 0.033599622547626495, + "learning_rate": 4.413639486287991e-07, + "loss": -0.018, + "num_tokens": 45110897.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6360160112380981, + "sampling/importance_sampling_ratio/mean": 1.0001624822616577, + "sampling/importance_sampling_ratio/min": 0.7722803950309753, + "sampling/sampling_logp_difference/max": 0.4922640323638916, + "sampling/sampling_logp_difference/mean": 0.013910435140132904, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 202.84375, + "completions/mean_terminated_length": 202.84375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.41037964820861816, + "epoch": 1.7561274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019878875723460726, + "kl": 0.03348096087574959, + "learning_rate": 4.406565446401476e-07, + "loss": 0.0003, + "num_tokens": 45139751.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5786607265472412, + "sampling/importance_sampling_ratio/mean": 1.000056266784668, + "sampling/importance_sampling_ratio/min": 0.7140791416168213, + "sampling/sampling_logp_difference/max": 0.4565768241882324, + "sampling/sampling_logp_difference/mean": 0.015522721223533154, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 184.078125, + "completions/mean_terminated_length": 184.078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4791666865348816, + "epoch": 1.7573529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.127227683632228, + "kl": 0.05700379237532616, + "learning_rate": 4.399492611149509e-07, + "loss": 0.0358, + "num_tokens": 45168332.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4940515756607056, + "sampling/importance_sampling_ratio/mean": 1.000679850578308, + "sampling/importance_sampling_ratio/min": 0.7372725605964661, + "sampling/sampling_logp_difference/max": 0.401491641998291, + "sampling/sampling_logp_difference/mean": 0.017366910353302956, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.5041054487228394, + "epoch": 1.758578431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7247880014361506, + "kl": 0.041422732174396515, + "learning_rate": 4.392420994889498e-07, + "loss": -0.0423, + "num_tokens": 45199676.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5277611017227173, + "sampling/importance_sampling_ratio/mean": 1.0000765323638916, + "sampling/importance_sampling_ratio/min": 0.705386757850647, + "sampling/sampling_logp_difference/max": 0.42380332946777344, + "sampling/sampling_logp_difference/mean": 0.017310116440057755, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 191.578125, + "completions/mean_terminated_length": 191.578125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.40415358543395996, + "epoch": 1.7598039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024389198589871545, + "kl": 0.03791702538728714, + "learning_rate": 4.385350611976376e-07, + "loss": 0.0003, + "num_tokens": 45227505.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3794703483581543, + "sampling/importance_sampling_ratio/mean": 1.0002727508544922, + "sampling/importance_sampling_ratio/min": 0.6700911521911621, + "sampling/sampling_logp_difference/max": 0.4003415107727051, + "sampling/sampling_logp_difference/mean": 0.01557602733373642, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 197.6875, + "completions/mean_terminated_length": 197.6875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.48839443922042847, + "epoch": 1.7610294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.747341235613586, + "kl": 0.03958131745457649, + "learning_rate": 4.3782814767625755e-07, + "loss": -0.0077, + "num_tokens": 45258205.0, + "reward": -0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4346733093261719, + "sampling/importance_sampling_ratio/mean": 1.0000269412994385, + "sampling/importance_sampling_ratio/min": 0.6903257966041565, + "sampling/sampling_logp_difference/max": 0.3705916404724121, + "sampling/sampling_logp_difference/mean": 0.01604308933019638, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 248.734375, + "completions/mean_terminated_length": 248.734375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.46708595752716064, + "epoch": 1.7622549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7063798639318799, + "kl": 0.04332081228494644, + "learning_rate": 4.371213603597987e-07, + "loss": -0.0109, + "num_tokens": 45290860.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.3585596084594727, + "sampling/importance_sampling_ratio/mean": 1.0002461671829224, + "sampling/importance_sampling_ratio/min": 0.5722141861915588, + "sampling/sampling_logp_difference/max": 0.5582419633865356, + "sampling/sampling_logp_difference/mean": 0.015267834067344666, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 237.03125, + "completions/mean_terminated_length": 237.03125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.40694278478622437, + "epoch": 1.7634803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01716887672229882, + "kl": 0.028053196147084236, + "learning_rate": 4.3641470068299483e-07, + "loss": 0.0003, + "num_tokens": 45331198.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.313656210899353, + "sampling/importance_sampling_ratio/mean": 1.0002212524414062, + "sampling/importance_sampling_ratio/min": 0.6681939363479614, + "sampling/sampling_logp_difference/max": 0.40317678451538086, + "sampling/sampling_logp_difference/mean": 0.01535987388342619, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 249.90625, + "completions/mean_terminated_length": 249.90625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.46517300605773926, + "epoch": 1.7647058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0968058975377974, + "kl": 0.03315602242946625, + "learning_rate": 4.3570817008032044e-07, + "loss": 0.0297, + "num_tokens": 45364568.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.408729910850525, + "sampling/importance_sampling_ratio/mean": 0.9997270703315735, + "sampling/importance_sampling_ratio/min": 0.6681921482086182, + "sampling/sampling_logp_difference/max": 0.4031795263290405, + "sampling/sampling_logp_difference/mean": 0.013626371510326862, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 148.9375, + "completions/mean_terminated_length": 148.9375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3735087811946869, + "epoch": 1.7659313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0202814849083852, + "kl": 0.061778899282217026, + "learning_rate": 4.350017699859877e-07, + "loss": 0.0152, + "num_tokens": 45386980.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.535566806793213, + "sampling/importance_sampling_ratio/mean": 1.0005061626434326, + "sampling/importance_sampling_ratio/min": 0.6316543221473694, + "sampling/sampling_logp_difference/max": 0.4594130516052246, + "sampling/sampling_logp_difference/mean": 0.01528320275247097, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 256.125, + "completions/mean_terminated_length": 256.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.46313539147377014, + "epoch": 1.767156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8291787052513004, + "kl": 0.04051543399691582, + "learning_rate": 4.342955018339441e-07, + "loss": 0.0094, + "num_tokens": 45419580.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2963625192642212, + "sampling/importance_sampling_ratio/mean": 1.0001630783081055, + "sampling/importance_sampling_ratio/min": 0.6908029317855835, + "sampling/sampling_logp_difference/max": 0.3699007034301758, + "sampling/sampling_logp_difference/mean": 0.015424519777297974, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.47829362750053406, + "epoch": 1.7683823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9674432569519643, + "kl": 0.04246748983860016, + "learning_rate": 4.335893670578694e-07, + "loss": 0.0047, + "num_tokens": 45459844.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5277862548828125, + "sampling/importance_sampling_ratio/mean": 1.000131368637085, + "sampling/importance_sampling_ratio/min": 0.6370444893836975, + "sampling/sampling_logp_difference/max": 0.4509158134460449, + "sampling/sampling_logp_difference/mean": 0.015260925516486168, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 164.84375, + "completions/mean_terminated_length": 164.84375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3611081838607788, + "epoch": 1.7696078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0393042253065224, + "kl": 0.0389854833483696, + "learning_rate": 4.328833670911724e-07, + "loss": 0.0004, + "num_tokens": 45485514.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5949219465255737, + "sampling/importance_sampling_ratio/mean": 0.9993205666542053, + "sampling/importance_sampling_ratio/min": 0.5590474605560303, + "sampling/sampling_logp_difference/max": 0.5815209746360779, + "sampling/sampling_logp_difference/mean": 0.014122438617050648, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 186.59375, + "completions/mean_terminated_length": 186.59375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4266851842403412, + "epoch": 1.7708333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9178166907920076, + "kl": 0.05023089796304703, + "learning_rate": 4.3217750336698803e-07, + "loss": 0.0046, + "num_tokens": 45511424.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997497200965881, + "sampling/importance_sampling_ratio/min": 0.6100702881813049, + "sampling/sampling_logp_difference/max": 0.7019822597503662, + "sampling/sampling_logp_difference/mean": 0.015094645321369171, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 176.640625, + "completions/mean_terminated_length": 176.640625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.4793596863746643, + "epoch": 1.7720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026665708128201734, + "kl": 0.04186504706740379, + "learning_rate": 4.314717773181752e-07, + "loss": 0.0004, + "num_tokens": 45541577.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3670605421066284, + "sampling/importance_sampling_ratio/mean": 1.000087022781372, + "sampling/importance_sampling_ratio/min": 0.6345877647399902, + "sampling/sampling_logp_difference/max": 0.45477962493896484, + "sampling/sampling_logp_difference/mean": 0.016608726233243942, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 186.953125, + "completions/mean_terminated_length": 186.953125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4314137101173401, + "epoch": 1.7732843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023656792639340025, + "kl": 0.040345869958400726, + "learning_rate": 4.3076619037731287e-07, + "loss": 0.0004, + "num_tokens": 45569238.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3493237495422363, + "sampling/importance_sampling_ratio/mean": 1.0000889301300049, + "sampling/importance_sampling_ratio/min": 0.5987197756767273, + "sampling/sampling_logp_difference/max": 0.5129616260528564, + "sampling/sampling_logp_difference/mean": 0.016253970563411713, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 230.609375, + "completions/mean_terminated_length": 230.609375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.45809489488601685, + "epoch": 1.7745098039215685, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1351322335383285, + "kl": 0.031431425362825394, + "learning_rate": 4.3006074397669836e-07, + "loss": -0.0083, + "num_tokens": 45604077.0, + "reward": 0.78125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.284456491470337, + "sampling/importance_sampling_ratio/mean": 1.000319480895996, + "sampling/importance_sampling_ratio/min": 0.6482195258140564, + "sampling/sampling_logp_difference/max": 0.43352580070495605, + "sampling/sampling_logp_difference/mean": 0.01463567465543747, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 264.46875, + "completions/mean_terminated_length": 264.46875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.4789225161075592, + "epoch": 1.7757352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9112477610484371, + "kl": 0.04142525792121887, + "learning_rate": 4.293554395483425e-07, + "loss": -0.0005, + "num_tokens": 45644155.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6009043455123901, + "sampling/importance_sampling_ratio/mean": 0.9999097585678101, + "sampling/importance_sampling_ratio/min": 0.7041060924530029, + "sampling/sampling_logp_difference/max": 0.4705686569213867, + "sampling/sampling_logp_difference/mean": 0.015574609860777855, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 258.890625, + "completions/mean_terminated_length": 258.890625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.43155670166015625, + "epoch": 1.7769607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01368765642754883, + "kl": 0.024679599329829216, + "learning_rate": 4.2865027852396894e-07, + "loss": 0.0002, + "num_tokens": 45680788.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5347754955291748, + "sampling/importance_sampling_ratio/mean": 1.0001623630523682, + "sampling/importance_sampling_ratio/min": 0.7059873938560486, + "sampling/sampling_logp_difference/max": 0.42838406562805176, + "sampling/sampling_logp_difference/mean": 0.014761995524168015, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 172.328125, + "completions/mean_terminated_length": 172.328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.42772746086120605, + "epoch": 1.778186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02520302182925972, + "kl": 0.04123090207576752, + "learning_rate": 4.2794526233501004e-07, + "loss": 0.0004, + "num_tokens": 45706457.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.52882719039917, + "sampling/importance_sampling_ratio/mean": 0.9998812079429626, + "sampling/importance_sampling_ratio/min": 0.6311986446380615, + "sampling/sampling_logp_difference/max": 0.46013474464416504, + "sampling/sampling_logp_difference/mean": 0.015842996537685394, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 236.421875, + "completions/mean_terminated_length": 236.421875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.5397396683692932, + "epoch": 1.7794117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9810596928979152, + "kl": 0.07520410418510437, + "learning_rate": 4.272403924126035e-07, + "loss": 0.0173, + "num_tokens": 45739348.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5650054216384888, + "sampling/importance_sampling_ratio/mean": 1.0001192092895508, + "sampling/importance_sampling_ratio/min": 0.6933867931365967, + "sampling/sampling_logp_difference/max": 0.4478893280029297, + "sampling/sampling_logp_difference/mean": 0.017063235864043236, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 233.09375, + "completions/mean_terminated_length": 233.09375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.41079872846603394, + "epoch": 1.780637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6442710194208022, + "kl": 0.02720138430595398, + "learning_rate": 4.2653567018759103e-07, + "loss": -0.0061, + "num_tokens": 45775434.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5081772804260254, + "sampling/importance_sampling_ratio/mean": 1.0005946159362793, + "sampling/importance_sampling_ratio/min": 0.5385057926177979, + "sampling/sampling_logp_difference/max": 0.6189570426940918, + "sampling/sampling_logp_difference/mean": 0.01502861175686121, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 233.296875, + "completions/mean_terminated_length": 233.296875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.40316036343574524, + "epoch": 1.781862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6642370122603454, + "kl": 0.03852131590247154, + "learning_rate": 4.258310970905139e-07, + "loss": -0.0014, + "num_tokens": 45810909.0, + "reward": -0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5914466381072998, + "sampling/importance_sampling_ratio/mean": 0.9998533725738525, + "sampling/importance_sampling_ratio/min": 0.6117665767669678, + "sampling/sampling_logp_difference/max": 0.49140453338623047, + "sampling/sampling_logp_difference/mean": 0.01392968650907278, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 240.9375, + "completions/mean_terminated_length": 240.9375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.46248531341552734, + "epoch": 1.7830882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017064247998052156, + "kl": 0.030550135299563408, + "learning_rate": 4.251266745516112e-07, + "loss": 0.0003, + "num_tokens": 45849609.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5256630182266235, + "sampling/importance_sampling_ratio/mean": 0.9996311664581299, + "sampling/importance_sampling_ratio/min": 0.6598731875419617, + "sampling/sampling_logp_difference/max": 0.42242908477783203, + "sampling/sampling_logp_difference/mean": 0.016041986644268036, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 192.34375, + "completions/mean_terminated_length": 192.34375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.4016135334968567, + "epoch": 1.784313725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.019064933887383, + "kl": 0.03971964493393898, + "learning_rate": 4.2442240400081556e-07, + "loss": -0.0196, + "num_tokens": 45881071.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5750302076339722, + "sampling/importance_sampling_ratio/mean": 0.9997595548629761, + "sampling/importance_sampling_ratio/min": 0.4159521460533142, + "sampling/sampling_logp_difference/max": 0.8771851062774658, + "sampling/sampling_logp_difference/mean": 0.014799212105572224, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 227.890625, + "completions/mean_terminated_length": 227.890625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.35006678104400635, + "epoch": 1.7855392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.710172264158936, + "kl": 0.02738107740879059, + "learning_rate": 4.2371828686775186e-07, + "loss": 0.0069, + "num_tokens": 45916424.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.650227427482605, + "sampling/importance_sampling_ratio/mean": 1.0000296831130981, + "sampling/importance_sampling_ratio/min": 0.6437652707099915, + "sampling/sampling_logp_difference/max": 0.500913143157959, + "sampling/sampling_logp_difference/mean": 0.012949788942933083, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 200.609375, + "completions/mean_terminated_length": 200.609375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4181138277053833, + "epoch": 1.7867647058823528, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3334042675386486, + "kl": 0.05847199261188507, + "learning_rate": 4.2301432458173316e-07, + "loss": -0.0091, + "num_tokens": 45941807.0, + "reward": -0.21875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": -0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.508720874786377, + "sampling/importance_sampling_ratio/mean": 1.0001122951507568, + "sampling/importance_sampling_ratio/min": 0.6050756573677063, + "sampling/sampling_logp_difference/max": 0.5024018287658691, + "sampling/sampling_logp_difference/mean": 0.016681130975484848, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 181.015625, + "completions/mean_terminated_length": 181.015625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.46196916699409485, + "epoch": 1.7879901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1725318917114993, + "kl": 0.07137738913297653, + "learning_rate": 4.223105185717585e-07, + "loss": -0.0018, + "num_tokens": 45968352.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4689947366714478, + "sampling/importance_sampling_ratio/mean": 0.9999860525131226, + "sampling/importance_sampling_ratio/min": 0.7149273157119751, + "sampling/sampling_logp_difference/max": 0.38457822799682617, + "sampling/sampling_logp_difference/mean": 0.017508184537291527, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 203.6875, + "completions/mean_terminated_length": 203.6875, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.3804335594177246, + "epoch": 1.7892156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019368482660348265, + "kl": 0.027461178600788116, + "learning_rate": 4.216068702665093e-07, + "loss": 0.0003, + "num_tokens": 45998796.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4015306234359741, + "sampling/importance_sampling_ratio/mean": 1.0002665519714355, + "sampling/importance_sampling_ratio/min": 0.6131548881530762, + "sampling/sampling_logp_difference/max": 0.4891376495361328, + "sampling/sampling_logp_difference/mean": 0.015096787363290787, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 235.234375, + "completions/mean_terminated_length": 235.234375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.37548473477363586, + "epoch": 1.7904411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015921481218471225, + "kl": 0.025881817564368248, + "learning_rate": 4.2090338109434703e-07, + "loss": 0.0003, + "num_tokens": 46035595.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6298270225524902, + "sampling/importance_sampling_ratio/mean": 0.9997875690460205, + "sampling/importance_sampling_ratio/min": 0.6951361894607544, + "sampling/sampling_logp_difference/max": 0.48847389221191406, + "sampling/sampling_logp_difference/mean": 0.014340518973767757, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 207.984375, + "completions/mean_terminated_length": 207.984375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3004228472709656, + "epoch": 1.7916666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013667664901205656, + "kl": 0.019026055932044983, + "learning_rate": 4.202000524833105e-07, + "loss": 0.0002, + "num_tokens": 46069626.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.0003504753112793, + "sampling/importance_sampling_ratio/min": 0.6548967957496643, + "sampling/sampling_logp_difference/max": 0.42380309104919434, + "sampling/sampling_logp_difference/mean": 0.01225491613149643, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 226.90625, + "completions/mean_terminated_length": 226.90625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.39118072390556335, + "epoch": 1.7928921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016486895443134506, + "kl": 0.032455939799547195, + "learning_rate": 4.194968858611117e-07, + "loss": 0.0003, + "num_tokens": 46102884.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 1.0000404119491577, + "sampling/importance_sampling_ratio/min": 0.6678564548492432, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.013750225305557251, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 250.859375, + "completions/mean_terminated_length": 250.859375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.43674230575561523, + "epoch": 1.7941176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8923833983931393, + "kl": 0.03069932386279106, + "learning_rate": 4.187938826551346e-07, + "loss": 0.0301, + "num_tokens": 46146075.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 0.9999037981033325, + "sampling/importance_sampling_ratio/min": 0.3847516179084778, + "sampling/sampling_logp_difference/max": 0.9551572799682617, + "sampling/sampling_logp_difference/mean": 0.015772782266139984, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 160.953125, + "completions/mean_terminated_length": 160.953125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3915073275566101, + "epoch": 1.795343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0581596796611143, + "kl": 0.04968298599123955, + "learning_rate": 4.180910442924311e-07, + "loss": 0.0119, + "num_tokens": 46170456.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5006968975067139, + "sampling/importance_sampling_ratio/mean": 0.9995827078819275, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.015751594677567482, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 199.5, + "completions/mean_terminated_length": 199.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.35393333435058594, + "epoch": 1.7965686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7935912510779416, + "kl": 0.030864953994750977, + "learning_rate": 4.173883721997188e-07, + "loss": -0.0084, + "num_tokens": 46205400.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5241999626159668, + "sampling/importance_sampling_ratio/mean": 0.999489426612854, + "sampling/importance_sampling_ratio/min": 0.6623748540878296, + "sampling/sampling_logp_difference/max": 0.42146968841552734, + "sampling/sampling_logp_difference/mean": 0.013956794515252113, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 216.8125, + "completions/mean_terminated_length": 216.8125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3866092264652252, + "epoch": 1.7977941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026090015442885556, + "kl": 0.03396444022655487, + "learning_rate": 4.1668586780337713e-07, + "loss": 0.0004, + "num_tokens": 46233532.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5496832132339478, + "sampling/importance_sampling_ratio/mean": 0.9995602369308472, + "sampling/importance_sampling_ratio/min": 0.6023508906364441, + "sampling/sampling_logp_difference/max": 0.5069150924682617, + "sampling/sampling_logp_difference/mean": 0.015118034556508064, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 202.34375, + "completions/mean_terminated_length": 202.34375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3430424630641937, + "epoch": 1.7990196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.084819651898561, + "kl": 0.03494244068861008, + "learning_rate": 4.159835325294457e-07, + "loss": 0.0014, + "num_tokens": 46259394.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4407918453216553, + "sampling/importance_sampling_ratio/mean": 0.9999051094055176, + "sampling/importance_sampling_ratio/min": 0.6133283376693726, + "sampling/sampling_logp_difference/max": 0.48885488510131836, + "sampling/sampling_logp_difference/mean": 0.01425672322511673, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 157.515625, + "completions/mean_terminated_length": 157.515625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.39985328912734985, + "epoch": 1.8002450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02940842733886677, + "kl": 0.0676310583949089, + "learning_rate": 4.152813678036208e-07, + "loss": 0.0006, + "num_tokens": 46288051.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00049889087677, + "sampling/importance_sampling_ratio/min": 0.6191496253013611, + "sampling/sampling_logp_difference/max": 0.7354001998901367, + "sampling/sampling_logp_difference/mean": 0.016767770051956177, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 191.28125, + "completions/mean_terminated_length": 191.28125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4206755459308624, + "epoch": 1.8014705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4012676856935273, + "kl": 0.0800822526216507, + "learning_rate": 4.145793750512522e-07, + "loss": 0.0496, + "num_tokens": 46316501.0, + "reward": -0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6270153522491455, + "sampling/importance_sampling_ratio/mean": 0.9992214441299438, + "sampling/importance_sampling_ratio/min": 0.5298165082931519, + "sampling/sampling_logp_difference/max": 0.6352245807647705, + "sampling/sampling_logp_difference/mean": 0.017360102385282516, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 198.65625, + "completions/mean_terminated_length": 198.65625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.42161741852760315, + "epoch": 1.8026960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042891532642960015, + "kl": 0.059884097427129745, + "learning_rate": 4.1387755569734054e-07, + "loss": 0.0006, + "num_tokens": 46347743.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997487664222717, + "sampling/importance_sampling_ratio/min": 0.614323616027832, + "sampling/sampling_logp_difference/max": 0.7349467277526855, + "sampling/sampling_logp_difference/mean": 0.015134022571146488, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 204.765625, + "completions/mean_terminated_length": 204.765625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.39879900217056274, + "epoch": 1.803921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01481514788890377, + "kl": 0.025549769401550293, + "learning_rate": 4.131759111665348e-07, + "loss": 0.0003, + "num_tokens": 46381808.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5880461931228638, + "sampling/importance_sampling_ratio/mean": 1.0000556707382202, + "sampling/importance_sampling_ratio/min": 0.6291193962097168, + "sampling/sampling_logp_difference/max": 0.46343421936035156, + "sampling/sampling_logp_difference/mean": 0.01481297705322504, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 173.484375, + "completions/mean_terminated_length": 173.484375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.40065518021583557, + "epoch": 1.8051470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05139028458355681, + "kl": 0.07758373022079468, + "learning_rate": 4.1247444288312895e-07, + "loss": 0.0007, + "num_tokens": 46410367.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.367601990699768, + "sampling/importance_sampling_ratio/mean": 0.9998440146446228, + "sampling/importance_sampling_ratio/min": 0.6487096548080444, + "sampling/sampling_logp_difference/max": 0.4327700138092041, + "sampling/sampling_logp_difference/mean": 0.016905371099710464, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 256.609375, + "completions/mean_terminated_length": 256.609375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.5070643424987793, + "epoch": 1.8063725490196079, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0673609305951133, + "kl": 0.04940847307443619, + "learning_rate": 4.1177315227105926e-07, + "loss": 0.0081, + "num_tokens": 46447670.0, + "reward": 0.125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6533323526382446, + "sampling/importance_sampling_ratio/mean": 1.0003235340118408, + "sampling/importance_sampling_ratio/min": 0.6956043839454651, + "sampling/sampling_logp_difference/max": 0.5027928352355957, + "sampling/sampling_logp_difference/mean": 0.017348986119031906, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 191.140625, + "completions/mean_terminated_length": 191.140625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.36436885595321655, + "epoch": 1.8075980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7400154310148165, + "kl": 0.05223090946674347, + "learning_rate": 4.1107204075390096e-07, + "loss": -0.0058, + "num_tokens": 46473247.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3586506843566895, + "sampling/importance_sampling_ratio/mean": 0.9999962449073792, + "sampling/importance_sampling_ratio/min": 0.6302604079246521, + "sampling/sampling_logp_difference/max": 0.4616222381591797, + "sampling/sampling_logp_difference/mean": 0.013969759456813335, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 198.609375, + "completions/mean_terminated_length": 198.609375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3275294303894043, + "epoch": 1.8088235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9667105337084376, + "kl": 0.02661885693669319, + "learning_rate": 4.1037110975486617e-07, + "loss": -0.019, + "num_tokens": 46503126.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5046790838241577, + "sampling/importance_sampling_ratio/mean": 1.0004401206970215, + "sampling/importance_sampling_ratio/min": 0.5255032181739807, + "sampling/sampling_logp_difference/max": 0.6433990001678467, + "sampling/sampling_logp_difference/mean": 0.013211781159043312, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 233.578125, + "completions/mean_terminated_length": 233.578125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.33777767419815063, + "epoch": 1.8100490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026791804709812933, + "kl": 0.025295440107584, + "learning_rate": 4.096703606968006e-07, + "loss": 0.0003, + "num_tokens": 46535675.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6054742336273193, + "sampling/importance_sampling_ratio/mean": 1.0002944469451904, + "sampling/importance_sampling_ratio/min": 0.6139808297157288, + "sampling/sampling_logp_difference/max": 0.4877915382385254, + "sampling/sampling_logp_difference/mean": 0.0140210697427392, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 230.671875, + "completions/mean_terminated_length": 230.671875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.40689587593078613, + "epoch": 1.8112745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014784997343498536, + "kl": 0.025457806885242462, + "learning_rate": 4.0896979500218014e-07, + "loss": 0.0002, + "num_tokens": 46575942.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5478439331054688, + "sampling/importance_sampling_ratio/mean": 0.9997664093971252, + "sampling/importance_sampling_ratio/min": 0.7019744515419006, + "sampling/sampling_logp_difference/max": 0.4368629455566406, + "sampling/sampling_logp_difference/mean": 0.0150857949629426, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 215.640625, + "completions/mean_terminated_length": 215.640625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3654599189758301, + "epoch": 1.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017102247445561616, + "kl": 0.03623443841934204, + "learning_rate": 4.082694140931088e-07, + "loss": 0.0004, + "num_tokens": 46607343.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6253825426101685, + "sampling/importance_sampling_ratio/mean": 0.9996971487998962, + "sampling/importance_sampling_ratio/min": 0.4742974638938904, + "sampling/sampling_logp_difference/max": 0.7459206581115723, + "sampling/sampling_logp_difference/mean": 0.013516171835362911, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 185.03125, + "completions/mean_terminated_length": 185.03125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.388782799243927, + "epoch": 1.8137254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8009492885848263, + "kl": 0.046929892152547836, + "learning_rate": 4.0756921939131563e-07, + "loss": -0.0025, + "num_tokens": 46636369.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3943331241607666, + "sampling/importance_sampling_ratio/mean": 0.9998328685760498, + "sampling/importance_sampling_ratio/min": 0.5221091508865356, + "sampling/sampling_logp_difference/max": 0.6498786211013794, + "sampling/sampling_logp_difference/mean": 0.01529900822788477, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 209.4375, + "completions/mean_terminated_length": 209.4375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.505455493927002, + "epoch": 1.8149509803921569, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2395268664872279, + "kl": 0.05534018576145172, + "learning_rate": 4.0686921231815155e-07, + "loss": 0.01, + "num_tokens": 46667837.0, + "reward": 0.0625, + "reward_std": 0.5081988573074341, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6977012157440186, + "sampling/importance_sampling_ratio/mean": 0.998990535736084, + "sampling/importance_sampling_ratio/min": 0.5189392566680908, + "sampling/sampling_logp_difference/max": 0.655968427658081, + "sampling/sampling_logp_difference/mean": 0.018538698554039, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 185.359375, + "completions/mean_terminated_length": 185.359375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.35948121547698975, + "epoch": 1.8161764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02318190868571896, + "kl": 0.035717546939849854, + "learning_rate": 4.0616939429458627e-07, + "loss": 0.0004, + "num_tokens": 46694036.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6129748821258545, + "sampling/importance_sampling_ratio/mean": 0.9997787475585938, + "sampling/importance_sampling_ratio/min": 0.5944111943244934, + "sampling/sampling_logp_difference/max": 0.5201840400695801, + "sampling/sampling_logp_difference/mean": 0.015287473797798157, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 183.625, + "completions/mean_terminated_length": 183.625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.326416552066803, + "epoch": 1.8174019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018925789830675076, + "kl": 0.02713128924369812, + "learning_rate": 4.0546976674120623e-07, + "loss": 0.0003, + "num_tokens": 46724636.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5976483821868896, + "sampling/importance_sampling_ratio/mean": 1.0004806518554688, + "sampling/importance_sampling_ratio/min": 0.7305005192756653, + "sampling/sampling_logp_difference/max": 0.4685328006744385, + "sampling/sampling_logp_difference/mean": 0.013860877603292465, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 225.0625, + "completions/mean_terminated_length": 225.0625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4279215633869171, + "epoch": 1.8186274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0377987253461263, + "kl": 0.049170322716236115, + "learning_rate": 4.047703310782111e-07, + "loss": 0.013, + "num_tokens": 46762304.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6267346143722534, + "sampling/importance_sampling_ratio/mean": 1.0001940727233887, + "sampling/importance_sampling_ratio/min": 0.6485505104064941, + "sampling/sampling_logp_difference/max": 0.486574649810791, + "sampling/sampling_logp_difference/mean": 0.01635306142270565, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 212.25, + "completions/mean_terminated_length": 212.25, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.39828744530677795, + "epoch": 1.8198529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023462238047806762, + "kl": 0.03244244307279587, + "learning_rate": 4.0407108872541105e-07, + "loss": 0.0003, + "num_tokens": 46797776.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6390002965927124, + "sampling/importance_sampling_ratio/mean": 1.0000399351119995, + "sampling/importance_sampling_ratio/min": 0.6574017405509949, + "sampling/sampling_logp_difference/max": 0.49408650398254395, + "sampling/sampling_logp_difference/mean": 0.014169460162520409, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 185.625, + "completions/mean_terminated_length": 185.625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.37603577971458435, + "epoch": 1.821078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8349295274843194, + "kl": 0.026338636875152588, + "learning_rate": 4.0337204110222347e-07, + "loss": 0.0111, + "num_tokens": 46829304.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6245514154434204, + "sampling/importance_sampling_ratio/mean": 0.9991324543952942, + "sampling/importance_sampling_ratio/min": 0.6294820308685303, + "sampling/sampling_logp_difference/max": 0.4852316379547119, + "sampling/sampling_logp_difference/mean": 0.016620833426713943, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 196.140625, + "completions/mean_terminated_length": 196.140625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.348731130361557, + "epoch": 1.8223039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05716206847792711, + "kl": 0.04719793424010277, + "learning_rate": 4.0267318962767076e-07, + "loss": 0.0004, + "num_tokens": 46859553.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5071709156036377, + "sampling/importance_sampling_ratio/mean": 0.9999433755874634, + "sampling/importance_sampling_ratio/min": 0.6129412651062012, + "sampling/sampling_logp_difference/max": 0.4894862174987793, + "sampling/sampling_logp_difference/mean": 0.015518147498369217, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 159.546875, + "completions/mean_terminated_length": 159.546875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.306646466255188, + "epoch": 1.8235294117647058, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0361161905300358, + "kl": 0.03056943230330944, + "learning_rate": 4.0197453572037747e-07, + "loss": 0.0003, + "num_tokens": 46887892.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4020594358444214, + "sampling/importance_sampling_ratio/mean": 1.0001198053359985, + "sampling/importance_sampling_ratio/min": 0.621240496635437, + "sampling/sampling_logp_difference/max": 0.47603702545166016, + "sampling/sampling_logp_difference/mean": 0.012928012758493423, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 225.8125, + "completions/mean_terminated_length": 225.8125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.42293429374694824, + "epoch": 1.8247549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1095277333689655, + "kl": 0.03491745889186859, + "learning_rate": 4.0127608079856644e-07, + "loss": -0.0007, + "num_tokens": 46917544.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4753106832504272, + "sampling/importance_sampling_ratio/mean": 1.0002186298370361, + "sampling/importance_sampling_ratio/min": 0.47906020283699036, + "sampling/sampling_logp_difference/max": 0.735929012298584, + "sampling/sampling_logp_difference/mean": 0.01622503250837326, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 212.65625, + "completions/mean_terminated_length": 212.65625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.36998146772384644, + "epoch": 1.8259803921568627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016325980401073328, + "kl": 0.025385094806551933, + "learning_rate": 4.005778262800571e-07, + "loss": 0.0003, + "num_tokens": 46950994.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002012252807617, + "sampling/importance_sampling_ratio/min": 0.6067395210266113, + "sampling/sampling_logp_difference/max": 0.853581428527832, + "sampling/sampling_logp_difference/mean": 0.01505836471915245, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 196.78125, + "completions/mean_terminated_length": 196.78125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3660886883735657, + "epoch": 1.8272058823529411, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016446958710568508, + "kl": 0.023385537788271904, + "learning_rate": 3.9987977358226175e-07, + "loss": 0.0002, + "num_tokens": 46984164.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4265543222427368, + "sampling/importance_sampling_ratio/mean": 1.0011383295059204, + "sampling/importance_sampling_ratio/min": 0.6771497130393982, + "sampling/sampling_logp_difference/max": 0.38986289501190186, + "sampling/sampling_logp_difference/mean": 0.015456505119800568, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 237.671875, + "completions/mean_terminated_length": 237.671875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.530724048614502, + "epoch": 1.8284313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7448468760999643, + "kl": 0.04840172827243805, + "learning_rate": 3.991819241221835e-07, + "loss": 0.0322, + "num_tokens": 47031519.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4663141965866089, + "sampling/importance_sampling_ratio/mean": 0.9994192719459534, + "sampling/importance_sampling_ratio/min": 0.6312842965126038, + "sampling/sampling_logp_difference/max": 0.4599989652633667, + "sampling/sampling_logp_difference/mean": 0.017024997621774673, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 239.78125, + "completions/mean_terminated_length": 239.78125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.47953343391418457, + "epoch": 1.829656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017750015634154247, + "kl": 0.03577635809779167, + "learning_rate": 3.98484279316412e-07, + "loss": 0.0004, + "num_tokens": 47068321.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4650589227676392, + "sampling/importance_sampling_ratio/mean": 1.0002245903015137, + "sampling/importance_sampling_ratio/min": 0.6398684978485107, + "sampling/sampling_logp_difference/max": 0.44649267196655273, + "sampling/sampling_logp_difference/mean": 0.016227155923843384, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 199.40625, + "completions/mean_terminated_length": 199.40625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3214770555496216, + "epoch": 1.8308823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017367003899370518, + "kl": 0.02642001025378704, + "learning_rate": 3.977868405811223e-07, + "loss": 0.0003, + "num_tokens": 47096475.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6212598085403442, + "sampling/importance_sampling_ratio/mean": 1.0001227855682373, + "sampling/importance_sampling_ratio/min": 0.6142379641532898, + "sampling/sampling_logp_difference/max": 0.48737287521362305, + "sampling/sampling_logp_difference/mean": 0.01426103338599205, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 202.390625, + "completions/mean_terminated_length": 202.390625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.36925238370895386, + "epoch": 1.8321078431372548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01574239035710109, + "kl": 0.023774582892656326, + "learning_rate": 3.970896093320708e-07, + "loss": 0.0002, + "num_tokens": 47127012.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.326669692993164, + "sampling/importance_sampling_ratio/mean": 0.9998118877410889, + "sampling/importance_sampling_ratio/min": 0.5278945565223694, + "sampling/sampling_logp_difference/max": 0.6388587951660156, + "sampling/sampling_logp_difference/mean": 0.014183331280946732, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 194.53125, + "completions/mean_terminated_length": 194.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.37928807735443115, + "epoch": 1.8333333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01891790696330377, + "kl": 0.02853918820619583, + "learning_rate": 3.9639258698459287e-07, + "loss": 0.0003, + "num_tokens": 47157222.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4152483940124512, + "sampling/importance_sampling_ratio/mean": 1.000333309173584, + "sampling/importance_sampling_ratio/min": 0.6547790765762329, + "sampling/sampling_logp_difference/max": 0.42345738410949707, + "sampling/sampling_logp_difference/mean": 0.014429192990064621, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 230.171875, + "completions/mean_terminated_length": 230.171875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.29474058747291565, + "epoch": 1.8345588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016042095422497977, + "kl": 0.020352240651845932, + "learning_rate": 3.9569577495359964e-07, + "loss": 0.0002, + "num_tokens": 47190641.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 0.9997077584266663, + "sampling/importance_sampling_ratio/min": 0.6276949048042297, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.011500919237732887, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 203.578125, + "completions/mean_terminated_length": 203.578125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.47230902314186096, + "epoch": 1.8357843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015473087539015013, + "kl": 0.028497405350208282, + "learning_rate": 3.949991746535753e-07, + "loss": 0.0003, + "num_tokens": 47220838.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.358568787574768, + "sampling/importance_sampling_ratio/mean": 0.9996574521064758, + "sampling/importance_sampling_ratio/min": 0.5100560188293457, + "sampling/sampling_logp_difference/max": 0.6732347011566162, + "sampling/sampling_logp_difference/mean": 0.01717330515384674, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 203.328125, + "completions/mean_terminated_length": 203.328125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3301214277744293, + "epoch": 1.8370098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017093617533739433, + "kl": 0.026626810431480408, + "learning_rate": 3.943027874985746e-07, + "loss": 0.0003, + "num_tokens": 47254683.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5623085498809814, + "sampling/importance_sampling_ratio/mean": 1.0003242492675781, + "sampling/importance_sampling_ratio/min": 0.6190605163574219, + "sampling/sampling_logp_difference/max": 0.4795522689819336, + "sampling/sampling_logp_difference/mean": 0.013283345848321915, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 206.140625, + "completions/mean_terminated_length": 206.140625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3752307891845703, + "epoch": 1.8382352941176472, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015116664729821374, + "kl": 0.023343995213508606, + "learning_rate": 3.9360661490221904e-07, + "loss": 0.0002, + "num_tokens": 47293956.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6121180057525635, + "sampling/importance_sampling_ratio/mean": 0.9999548196792603, + "sampling/importance_sampling_ratio/min": 0.6824631690979004, + "sampling/sampling_logp_difference/max": 0.47754883766174316, + "sampling/sampling_logp_difference/mean": 0.01394678931683302, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 214.578125, + "completions/mean_terminated_length": 214.578125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.31524908542633057, + "epoch": 1.8394607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7801255082675378, + "kl": 0.030825018882751465, + "learning_rate": 3.929106582776948e-07, + "loss": 0.0721, + "num_tokens": 47325033.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.639209508895874, + "sampling/importance_sampling_ratio/mean": 0.9999526143074036, + "sampling/importance_sampling_ratio/min": 0.6328410506248474, + "sampling/sampling_logp_difference/max": 0.4942140579223633, + "sampling/sampling_logp_difference/mean": 0.013170123100280762, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 183.8125, + "completions/mean_terminated_length": 183.8125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4793202877044678, + "epoch": 1.840686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8663195262513032, + "kl": 0.04631558060646057, + "learning_rate": 3.9221491903775013e-07, + "loss": -0.031, + "num_tokens": 47358893.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.606947898864746, + "sampling/importance_sampling_ratio/mean": 1.0002026557922363, + "sampling/importance_sampling_ratio/min": 0.5907023549079895, + "sampling/sampling_logp_difference/max": 0.5264430046081543, + "sampling/sampling_logp_difference/mean": 0.01629520207643509, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 210.0, + "completions/mean_terminated_length": 210.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4045953154563904, + "epoch": 1.8419117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7631893991916184, + "kl": 0.04685451462864876, + "learning_rate": 3.9151939859469166e-07, + "loss": 0.0155, + "num_tokens": 47391165.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.2985237836837769, + "sampling/importance_sampling_ratio/mean": 1.0000450611114502, + "sampling/importance_sampling_ratio/min": 0.6058517098426819, + "sampling/sampling_logp_difference/max": 0.5011200904846191, + "sampling/sampling_logp_difference/mean": 0.01413068175315857, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 192.03125, + "completions/mean_terminated_length": 192.03125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.34464895725250244, + "epoch": 1.843137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01662504791497207, + "kl": 0.025617942214012146, + "learning_rate": 3.908240983603813e-07, + "loss": 0.0003, + "num_tokens": 47422383.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4113905429840088, + "sampling/importance_sampling_ratio/mean": 0.9997320175170898, + "sampling/importance_sampling_ratio/min": 0.62844318151474, + "sampling/sampling_logp_difference/max": 0.4645097255706787, + "sampling/sampling_logp_difference/mean": 0.013034269213676453, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 210.8125, + "completions/mean_terminated_length": 210.8125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3480450510978699, + "epoch": 1.844362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7958356948308606, + "kl": 0.03220169246196747, + "learning_rate": 3.9012901974623476e-07, + "loss": 0.0099, + "num_tokens": 47450707.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3982449769973755, + "sampling/importance_sampling_ratio/mean": 0.9996621608734131, + "sampling/importance_sampling_ratio/min": 0.6385641098022461, + "sampling/sampling_logp_difference/max": 0.448533296585083, + "sampling/sampling_logp_difference/mean": 0.013606126420199871, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 195.6875, + "completions/mean_terminated_length": 195.6875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.453193336725235, + "epoch": 1.8455882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9597914881370286, + "kl": 0.04644181951880455, + "learning_rate": 3.894341641632176e-07, + "loss": -0.0137, + "num_tokens": 47486703.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5226556062698364, + "sampling/importance_sampling_ratio/mean": 1.0002065896987915, + "sampling/importance_sampling_ratio/min": 0.6109529137611389, + "sampling/sampling_logp_difference/max": 0.4927353858947754, + "sampling/sampling_logp_difference/mean": 0.016492381691932678, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 253.53125, + "completions/mean_terminated_length": 253.53125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.41846609115600586, + "epoch": 1.846813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019047337485516454, + "kl": 0.04174449294805527, + "learning_rate": 3.8873953302184283e-07, + "loss": 0.0004, + "num_tokens": 47522833.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3754518032073975, + "sampling/importance_sampling_ratio/mean": 0.999981701374054, + "sampling/importance_sampling_ratio/min": 0.6383336186408997, + "sampling/sampling_logp_difference/max": 0.4488942623138428, + "sampling/sampling_logp_difference/mean": 0.014663152396678925, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 226.265625, + "completions/mean_terminated_length": 226.265625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.36275559663772583, + "epoch": 1.8480392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0279651867417821, + "kl": 0.031749993562698364, + "learning_rate": 3.880451277321673e-07, + "loss": 0.0003, + "num_tokens": 47555106.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.424131155014038, + "sampling/importance_sampling_ratio/mean": 0.9998042583465576, + "sampling/importance_sampling_ratio/min": 0.56624436378479, + "sampling/sampling_logp_difference/max": 0.5687295198440552, + "sampling/sampling_logp_difference/mean": 0.014154046773910522, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 155.078125, + "completions/mean_terminated_length": 155.078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.31655555963516235, + "epoch": 1.8492647058823528, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021751454697667986, + "kl": 0.026957426220178604, + "learning_rate": 3.873509497037899e-07, + "loss": 0.0003, + "num_tokens": 47584391.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4381095170974731, + "sampling/importance_sampling_ratio/mean": 1.0002214908599854, + "sampling/importance_sampling_ratio/min": 0.6433576941490173, + "sampling/sampling_logp_difference/max": 0.44105446338653564, + "sampling/sampling_logp_difference/mean": 0.012428762391209602, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 208.6875, + "completions/mean_terminated_length": 208.6875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.32220640778541565, + "epoch": 1.8504901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015277402866472353, + "kl": 0.024374518543481827, + "learning_rate": 3.8665700034584834e-07, + "loss": 0.0002, + "num_tokens": 47617091.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.575573205947876, + "sampling/importance_sampling_ratio/mean": 0.9997588992118835, + "sampling/importance_sampling_ratio/min": 0.5096511840820312, + "sampling/sampling_logp_difference/max": 0.674028754234314, + "sampling/sampling_logp_difference/mean": 0.01380470022559166, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 165.578125, + "completions/mean_terminated_length": 165.578125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.33295172452926636, + "epoch": 1.8517156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05001211305223302, + "kl": 0.04372579604387283, + "learning_rate": 3.8596328106701533e-07, + "loss": 0.0004, + "num_tokens": 47641656.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.554552435874939, + "sampling/importance_sampling_ratio/mean": 1.0006695985794067, + "sampling/importance_sampling_ratio/min": 0.6678668260574341, + "sampling/sampling_logp_difference/max": 0.44118762016296387, + "sampling/sampling_logp_difference/mean": 0.014502106234431267, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 186.90625, + "completions/mean_terminated_length": 186.90625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4239566922187805, + "epoch": 1.8529411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023184292748248538, + "kl": 0.03840119391679764, + "learning_rate": 3.8526979327549736e-07, + "loss": 0.0004, + "num_tokens": 47676370.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5825395584106445, + "sampling/importance_sampling_ratio/mean": 1.000715732574463, + "sampling/importance_sampling_ratio/min": 0.6703637838363647, + "sampling/sampling_logp_difference/max": 0.4590308666229248, + "sampling/sampling_logp_difference/mean": 0.015674592927098274, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 195.53125, + "completions/mean_terminated_length": 195.53125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.35997116565704346, + "epoch": 1.8541666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8710103041200632, + "kl": 0.03831769526004791, + "learning_rate": 3.845765383790306e-07, + "loss": 0.0008, + "num_tokens": 47704772.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.3383262157440186, + "sampling/importance_sampling_ratio/mean": 0.9996328949928284, + "sampling/importance_sampling_ratio/min": 0.6200441718101501, + "sampling/sampling_logp_difference/max": 0.4779646396636963, + "sampling/sampling_logp_difference/mean": 0.015365565195679665, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 216.265625, + "completions/mean_terminated_length": 216.265625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.4190855622291565, + "epoch": 1.8553921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01935169307066362, + "kl": 0.03445492312312126, + "learning_rate": 3.8388351778487875e-07, + "loss": 0.0004, + "num_tokens": 47737845.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4253652095794678, + "sampling/importance_sampling_ratio/mean": 0.9997737407684326, + "sampling/importance_sampling_ratio/min": 0.7136714458465576, + "sampling/sampling_logp_difference/max": 0.3544280529022217, + "sampling/sampling_logp_difference/mean": 0.0158874299377203, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 201.796875, + "completions/mean_terminated_length": 201.796875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.38299238681793213, + "epoch": 1.8566176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015362492473837216, + "kl": 0.02534967102110386, + "learning_rate": 3.831907328998295e-07, + "loss": 0.0003, + "num_tokens": 47770328.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4909775257110596, + "sampling/importance_sampling_ratio/mean": 1.0000851154327393, + "sampling/importance_sampling_ratio/min": 0.7270282506942749, + "sampling/sampling_logp_difference/max": 0.3994319438934326, + "sampling/sampling_logp_difference/mean": 0.015265097841620445, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 195.4375, + "completions/mean_terminated_length": 195.4375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3300290107727051, + "epoch": 1.857843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017287688114681694, + "kl": 0.030581336468458176, + "learning_rate": 3.824981851301924e-07, + "loss": 0.0003, + "num_tokens": 47800260.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6193667650222778, + "sampling/importance_sampling_ratio/mean": 1.0000019073486328, + "sampling/importance_sampling_ratio/min": 0.6434445977210999, + "sampling/sampling_logp_difference/max": 0.48203516006469727, + "sampling/sampling_logp_difference/mean": 0.013907796703279018, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 203.0, + "completions/mean_terminated_length": 203.0, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.39566364884376526, + "epoch": 1.8590686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021384363972712102, + "kl": 0.03867260366678238, + "learning_rate": 3.818058758817955e-07, + "loss": 0.0003, + "num_tokens": 47833652.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4483669996261597, + "sampling/importance_sampling_ratio/mean": 0.9995390176773071, + "sampling/importance_sampling_ratio/min": 0.6335282921791077, + "sampling/sampling_logp_difference/max": 0.4564507007598877, + "sampling/sampling_logp_difference/mean": 0.015130220912396908, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 215.140625, + "completions/mean_terminated_length": 215.140625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.37887823581695557, + "epoch": 1.8602941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1560713613113136, + "kl": 0.03132549673318863, + "learning_rate": 3.81113806559983e-07, + "loss": -0.0081, + "num_tokens": 47863229.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4703065156936646, + "sampling/importance_sampling_ratio/mean": 1.0002840757369995, + "sampling/importance_sampling_ratio/min": 0.6794388890266418, + "sampling/sampling_logp_difference/max": 0.3864879608154297, + "sampling/sampling_logp_difference/mean": 0.015126525424420834, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 161.578125, + "completions/mean_terminated_length": 161.578125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4315345585346222, + "epoch": 1.8615196078431373, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.42162824891901, + "kl": 0.11316701024770737, + "learning_rate": 3.804219785696113e-07, + "loss": 0.0433, + "num_tokens": 47887698.0, + "reward": 0.21875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.283816933631897, + "sampling/importance_sampling_ratio/mean": 0.999412477016449, + "sampling/importance_sampling_ratio/min": 0.608467698097229, + "sampling/sampling_logp_difference/max": 0.4968113899230957, + "sampling/sampling_logp_difference/mean": 0.016700876876711845, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 164.828125, + "completions/mean_terminated_length": 164.828125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.30290788412094116, + "epoch": 1.8627450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018151543106306196, + "kl": 0.026134612038731575, + "learning_rate": 3.797303933150475e-07, + "loss": 0.0002, + "num_tokens": 47912551.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5077170133590698, + "sampling/importance_sampling_ratio/mean": 1.00059974193573, + "sampling/importance_sampling_ratio/min": 0.6030475497245789, + "sampling/sampling_logp_difference/max": 0.5057592391967773, + "sampling/sampling_logp_difference/mean": 0.013300842605531216, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 235.734375, + "completions/mean_terminated_length": 235.734375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3396828770637512, + "epoch": 1.8639705882352942, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01272104473429735, + "kl": 0.02101752534508705, + "learning_rate": 3.790390522001662e-07, + "loss": 0.0002, + "num_tokens": 47949382.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8631864786148071, + "sampling/importance_sampling_ratio/mean": 1.00046706199646, + "sampling/importance_sampling_ratio/min": 0.6420100927352905, + "sampling/sampling_logp_difference/max": 0.6222882270812988, + "sampling/sampling_logp_difference/mean": 0.01321179885417223, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 222.359375, + "completions/mean_terminated_length": 222.359375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3400956392288208, + "epoch": 1.8651960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0161972383848884, + "kl": 0.026999952271580696, + "learning_rate": 3.7834795662834566e-07, + "loss": 0.0002, + "num_tokens": 47980733.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.644083023071289, + "sampling/importance_sampling_ratio/mean": 1.000594973564148, + "sampling/importance_sampling_ratio/min": 0.5639978647232056, + "sampling/sampling_logp_difference/max": 0.5727047920227051, + "sampling/sampling_logp_difference/mean": 0.013366002589464188, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 286.015625, + "completions/mean_terminated_length": 286.015625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.47846367955207825, + "epoch": 1.866421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5874284622063278, + "kl": 0.025258827954530716, + "learning_rate": 3.776571080024663e-07, + "loss": 0.0091, + "num_tokens": 48022814.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4995115995407104, + "sampling/importance_sampling_ratio/mean": 1.0000669956207275, + "sampling/importance_sampling_ratio/min": 0.6933954954147339, + "sampling/sampling_logp_difference/max": 0.4051394462585449, + "sampling/sampling_logp_difference/mean": 0.015736836940050125, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 230.796875, + "completions/mean_terminated_length": 230.796875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3768925666809082, + "epoch": 1.8676470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015557378134246427, + "kl": 0.02143799141049385, + "learning_rate": 3.76966507724907e-07, + "loss": 0.0002, + "num_tokens": 48062049.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4400556087493896, + "sampling/importance_sampling_ratio/mean": 1.000324010848999, + "sampling/importance_sampling_ratio/min": 0.6300141215324402, + "sampling/sampling_logp_difference/max": 0.46201300621032715, + "sampling/sampling_logp_difference/mean": 0.01351095736026764, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 262.21875, + "completions/mean_terminated_length": 262.21875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.47479164600372314, + "epoch": 1.8688725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7609147646390458, + "kl": 0.03068857453763485, + "learning_rate": 3.762761571975429e-07, + "loss": -0.0246, + "num_tokens": 48101055.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4294888973236084, + "sampling/importance_sampling_ratio/mean": 1.0001832246780396, + "sampling/importance_sampling_ratio/min": 0.6955002546310425, + "sampling/sampling_logp_difference/max": 0.36312389373779297, + "sampling/sampling_logp_difference/mean": 0.01597488485276699, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 190.15625, + "completions/mean_terminated_length": 190.15625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3384573459625244, + "epoch": 1.8700980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022967983325005957, + "kl": 0.04190897196531296, + "learning_rate": 3.755860578217413e-07, + "loss": 0.0005, + "num_tokens": 48132617.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6337945461273193, + "sampling/importance_sampling_ratio/mean": 0.99971604347229, + "sampling/importance_sampling_ratio/min": 0.677150309085846, + "sampling/sampling_logp_difference/max": 0.4909052848815918, + "sampling/sampling_logp_difference/mean": 0.01371623296290636, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 208.25, + "completions/mean_terminated_length": 208.25, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3548075556755066, + "epoch": 1.8713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016443605834087695, + "kl": 0.026354052126407623, + "learning_rate": 3.7489621099836043e-07, + "loss": 0.0003, + "num_tokens": 48162521.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4063249826431274, + "sampling/importance_sampling_ratio/mean": 0.9999675750732422, + "sampling/importance_sampling_ratio/min": 0.7368854284286499, + "sampling/sampling_logp_difference/max": 0.3409799337387085, + "sampling/sampling_logp_difference/mean": 0.013621061109006405, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 213.75, + "completions/mean_terminated_length": 213.75, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3396419286727905, + "epoch": 1.8725490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03021546402426761, + "kl": 0.035038430243730545, + "learning_rate": 3.742066181277457e-07, + "loss": 0.0003, + "num_tokens": 48197305.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7287288904190063, + "sampling/importance_sampling_ratio/mean": 1.000309944152832, + "sampling/importance_sampling_ratio/min": 0.6089730858802795, + "sampling/sampling_logp_difference/max": 0.5473864078521729, + "sampling/sampling_logp_difference/mean": 0.013034914620220661, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 215.859375, + "completions/mean_terminated_length": 215.859375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.4406580924987793, + "epoch": 1.8737745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9237220374056536, + "kl": 0.04749206081032753, + "learning_rate": 3.735172806097271e-07, + "loss": 0.0224, + "num_tokens": 48231536.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.53892982006073, + "sampling/importance_sampling_ratio/mean": 0.9993038177490234, + "sampling/importance_sampling_ratio/min": 0.6317639350891113, + "sampling/sampling_logp_difference/max": 0.45923948287963867, + "sampling/sampling_logp_difference/mean": 0.015829021111130714, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 195.328125, + "completions/mean_terminated_length": 195.328125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3406910300254822, + "epoch": 1.875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6532458550494024, + "kl": 0.026171743869781494, + "learning_rate": 3.7282819984361577e-07, + "loss": 0.0028, + "num_tokens": 48261893.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5282044410705566, + "sampling/importance_sampling_ratio/mean": 0.9998431205749512, + "sampling/importance_sampling_ratio/min": 0.6920354962348938, + "sampling/sampling_logp_difference/max": 0.42409348487854004, + "sampling/sampling_logp_difference/mean": 0.01412119995802641, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 205.859375, + "completions/mean_terminated_length": 205.859375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.36345553398132324, + "epoch": 1.8762254901960784, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.805099297675102, + "kl": 0.027597036212682724, + "learning_rate": 3.721393772282022e-07, + "loss": -0.0021, + "num_tokens": 48290988.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4377270936965942, + "sampling/importance_sampling_ratio/mean": 1.0010266304016113, + "sampling/importance_sampling_ratio/min": 0.6303300857543945, + "sampling/sampling_logp_difference/max": 0.46151161193847656, + "sampling/sampling_logp_difference/mean": 0.015081214718520641, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 196.34375, + "completions/mean_terminated_length": 196.34375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.42034146189689636, + "epoch": 1.8774509803921569, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9478155692519119, + "kl": 0.02742549404501915, + "learning_rate": 3.7145081416175264e-07, + "loss": 0.0004, + "num_tokens": 48321250.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4818246364593506, + "sampling/importance_sampling_ratio/mean": 0.9998699426651001, + "sampling/importance_sampling_ratio/min": 0.7189988493919373, + "sampling/sampling_logp_difference/max": 0.393274188041687, + "sampling/sampling_logp_difference/mean": 0.014403371140360832, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 200.125, + "completions/mean_terminated_length": 200.125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.425557017326355, + "epoch": 1.8786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026734386894002888, + "kl": 0.053220491856336594, + "learning_rate": 3.7076251204200667e-07, + "loss": 0.0005, + "num_tokens": 48352794.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007447242736816, + "sampling/importance_sampling_ratio/mean": 0.9991117715835571, + "sampling/importance_sampling_ratio/min": 0.7116121053695679, + "sampling/sampling_logp_difference/max": 0.47046899795532227, + "sampling/sampling_logp_difference/mean": 0.014125547371804714, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 182.671875, + "completions/mean_terminated_length": 182.671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3695541024208069, + "epoch": 1.8799019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018940723807235122, + "kl": 0.03589736297726631, + "learning_rate": 3.700744722661736e-07, + "loss": 0.0004, + "num_tokens": 48378533.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5425060987472534, + "sampling/importance_sampling_ratio/mean": 0.9998379945755005, + "sampling/importance_sampling_ratio/min": 0.6880703568458557, + "sampling/sampling_logp_difference/max": 0.4334084987640381, + "sampling/sampling_logp_difference/mean": 0.01465071551501751, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.37948712706565857, + "epoch": 1.8811274509803921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018318198725694557, + "kl": 0.028661314398050308, + "learning_rate": 3.693866962309308e-07, + "loss": 0.0003, + "num_tokens": 48409437.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5045616626739502, + "sampling/importance_sampling_ratio/mean": 0.999610185623169, + "sampling/importance_sampling_ratio/min": 0.5768758654594421, + "sampling/sampling_logp_difference/max": 0.5501282215118408, + "sampling/sampling_logp_difference/mean": 0.015293458476662636, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 185.578125, + "completions/mean_terminated_length": 185.578125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.5331075191497803, + "epoch": 1.8823529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05196876691264794, + "kl": 0.07588602602481842, + "learning_rate": 3.686991853324202e-07, + "loss": 0.0008, + "num_tokens": 48440530.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5011667013168335, + "sampling/importance_sampling_ratio/mean": 0.999535322189331, + "sampling/importance_sampling_ratio/min": 0.6088920831680298, + "sampling/sampling_logp_difference/max": 0.49611425399780273, + "sampling/sampling_logp_difference/mean": 0.018779968842864037, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 217.0625, + "completions/mean_terminated_length": 217.0625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.32210832834243774, + "epoch": 1.883578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017685293873126322, + "kl": 0.02645273134112358, + "learning_rate": 3.680119409662451e-07, + "loss": 0.0002, + "num_tokens": 48471222.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.635670781135559, + "sampling/importance_sampling_ratio/mean": 1.0004723072052002, + "sampling/importance_sampling_ratio/min": 0.6056347489356995, + "sampling/sampling_logp_difference/max": 0.5014781951904297, + "sampling/sampling_logp_difference/mean": 0.012803269550204277, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 208.4375, + "completions/mean_terminated_length": 208.4375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.39927926659584045, + "epoch": 1.8848039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015047651396016628, + "kl": 0.02834579348564148, + "learning_rate": 3.673249645274682e-07, + "loss": 0.0003, + "num_tokens": 48502898.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5285691022872925, + "sampling/importance_sampling_ratio/mean": 1.0005877017974854, + "sampling/importance_sampling_ratio/min": 0.6625506281852722, + "sampling/sampling_logp_difference/max": 0.4243321418762207, + "sampling/sampling_logp_difference/mean": 0.014671064913272858, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 187.171875, + "completions/mean_terminated_length": 187.171875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4136110544204712, + "epoch": 1.8860294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9895108502647249, + "kl": 0.036319661885499954, + "learning_rate": 3.6663825741060805e-07, + "loss": 0.0123, + "num_tokens": 48534957.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.844038486480713, + "sampling/importance_sampling_ratio/mean": 0.9997587203979492, + "sampling/importance_sampling_ratio/min": 0.6100226044654846, + "sampling/sampling_logp_difference/max": 0.6119580268859863, + "sampling/sampling_logp_difference/mean": 0.016377631574869156, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 213.390625, + "completions/mean_terminated_length": 213.390625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.4372606575489044, + "epoch": 1.8872549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9155552613239852, + "kl": 0.03465927392244339, + "learning_rate": 3.6595182100963686e-07, + "loss": -0.0025, + "num_tokens": 48564294.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5823155641555786, + "sampling/importance_sampling_ratio/mean": 1.0003228187561035, + "sampling/importance_sampling_ratio/min": 0.6966019868850708, + "sampling/sampling_logp_difference/max": 0.4588892459869385, + "sampling/sampling_logp_difference/mean": 0.015279426239430904, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 201.28125, + "completions/mean_terminated_length": 201.28125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.44704121351242065, + "epoch": 1.8884803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0935355314855688, + "kl": 0.05873614177107811, + "learning_rate": 3.652656567179765e-07, + "loss": 0.0051, + "num_tokens": 48592648.0, + "reward": 0.46875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3280521631240845, + "sampling/importance_sampling_ratio/mean": 1.0003440380096436, + "sampling/importance_sampling_ratio/min": 0.7045369744300842, + "sampling/sampling_logp_difference/max": 0.35021448135375977, + "sampling/sampling_logp_difference/mean": 0.016065191477537155, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 199.71875, + "completions/mean_terminated_length": 199.71875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3954826891422272, + "epoch": 1.8897058823529411, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6614945317414733, + "kl": 0.024616148322820663, + "learning_rate": 3.645797659284975e-07, + "loss": -0.0268, + "num_tokens": 48620118.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.43571937084198, + "sampling/importance_sampling_ratio/mean": 1.0004339218139648, + "sampling/importance_sampling_ratio/min": 0.6173369288444519, + "sampling/sampling_logp_difference/max": 0.48234033584594727, + "sampling/sampling_logp_difference/mean": 0.014273724518716335, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 201.890625, + "completions/mean_terminated_length": 201.890625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4209592938423157, + "epoch": 1.8909313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015838906829838583, + "kl": 0.03292512893676758, + "learning_rate": 3.638941500335144e-07, + "loss": 0.0003, + "num_tokens": 48650751.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.448422908782959, + "sampling/importance_sampling_ratio/mean": 1.0003647804260254, + "sampling/importance_sampling_ratio/min": 0.6615846753120422, + "sampling/sampling_logp_difference/max": 0.41311728954315186, + "sampling/sampling_logp_difference/mean": 0.014716507866978645, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 236.109375, + "completions/mean_terminated_length": 236.109375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.4740673005580902, + "epoch": 1.892156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.043215480833453, + "kl": 0.047959618270397186, + "learning_rate": 3.6320881042478433e-07, + "loss": -0.0196, + "num_tokens": 48681990.0, + "reward": 0.34375, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6622728109359741, + "sampling/importance_sampling_ratio/mean": 1.0003360509872437, + "sampling/importance_sampling_ratio/min": 0.626307487487793, + "sampling/sampling_logp_difference/max": 0.508185863494873, + "sampling/sampling_logp_difference/mean": 0.016803443431854248, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 160.515625, + "completions/mean_terminated_length": 160.515625, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "entropy": 0.342548668384552, + "epoch": 1.8933823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022317383834636773, + "kl": 0.03253442049026489, + "learning_rate": 3.6252374849350303e-07, + "loss": 0.0003, + "num_tokens": 48710743.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9671460390090942, + "sampling/importance_sampling_ratio/mean": 0.9998558759689331, + "sampling/importance_sampling_ratio/min": 0.5887016654014587, + "sampling/sampling_logp_difference/max": 0.6765837669372559, + "sampling/sampling_logp_difference/mean": 0.014454076066613197, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 203.40625, + "completions/mean_terminated_length": 203.40625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3158635199069977, + "epoch": 1.8946078431372548, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7180401122204123, + "kl": 0.02597656473517418, + "learning_rate": 3.618389656303029e-07, + "loss": -0.0126, + "num_tokens": 48742561.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4308671951293945, + "sampling/importance_sampling_ratio/mean": 1.0004360675811768, + "sampling/importance_sampling_ratio/min": 0.6622448563575745, + "sampling/sampling_logp_difference/max": 0.41211986541748047, + "sampling/sampling_logp_difference/mean": 0.01251291949301958, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 213.203125, + "completions/mean_terminated_length": 213.203125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.4119238257408142, + "epoch": 1.8958333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023738719517323002, + "kl": 0.04377554729580879, + "learning_rate": 3.6115446322525e-07, + "loss": 0.0005, + "num_tokens": 48777742.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4875625371932983, + "sampling/importance_sampling_ratio/mean": 0.9997202754020691, + "sampling/importance_sampling_ratio/min": 0.722936749458313, + "sampling/sampling_logp_difference/max": 0.3971388339996338, + "sampling/sampling_logp_difference/mean": 0.01472897082567215, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 261.125, + "completions/mean_terminated_length": 261.125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4818275272846222, + "epoch": 1.8970588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7594003267221461, + "kl": 0.030104786157608032, + "learning_rate": 3.6047024266784035e-07, + "loss": 0.006, + "num_tokens": 48824262.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5562759637832642, + "sampling/importance_sampling_ratio/mean": 1.0002262592315674, + "sampling/importance_sampling_ratio/min": 0.607857882976532, + "sampling/sampling_logp_difference/max": 0.4978141784667969, + "sampling/sampling_logp_difference/mean": 0.015647035092115402, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 232.296875, + "completions/mean_terminated_length": 232.296875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.43514683842658997, + "epoch": 1.8982843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023241149598766817, + "kl": 0.04113928601145744, + "learning_rate": 3.5978630534699865e-07, + "loss": 0.0004, + "num_tokens": 48859977.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.692874550819397, + "sampling/importance_sampling_ratio/mean": 1.0002148151397705, + "sampling/importance_sampling_ratio/min": 0.6567551493644714, + "sampling/sampling_logp_difference/max": 0.5264279842376709, + "sampling/sampling_logp_difference/mean": 0.015478353947401047, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 225.265625, + "completions/mean_terminated_length": 225.265625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.38786888122558594, + "epoch": 1.8995098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7722707835378005, + "kl": 0.028615374118089676, + "learning_rate": 3.591026526510742e-07, + "loss": -0.0096, + "num_tokens": 48895546.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.2822431325912476, + "sampling/importance_sampling_ratio/mean": 0.9998630881309509, + "sampling/importance_sampling_ratio/min": 0.5943406224250793, + "sampling/sampling_logp_difference/max": 0.5203027725219727, + "sampling/sampling_logp_difference/mean": 0.013558058068156242, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 233.75, + "completions/mean_terminated_length": 233.75, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.41978776454925537, + "epoch": 1.9007352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9088531463325852, + "kl": 0.034309446811676025, + "learning_rate": 3.584192859678391e-07, + "loss": 0.0111, + "num_tokens": 48927754.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.3229866027832031, + "sampling/importance_sampling_ratio/mean": 0.9999614357948303, + "sampling/importance_sampling_ratio/min": 0.7101708650588989, + "sampling/sampling_logp_difference/max": 0.3422497510910034, + "sampling/sampling_logp_difference/mean": 0.014441236853599548, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 233.421875, + "completions/mean_terminated_length": 233.421875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.46079522371292114, + "epoch": 1.9019607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7475422481338134, + "kl": 0.030957430601119995, + "learning_rate": 3.577362066844838e-07, + "loss": -0.0197, + "num_tokens": 48960805.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.409862756729126, + "sampling/importance_sampling_ratio/mean": 0.999723494052887, + "sampling/importance_sampling_ratio/min": 0.636890172958374, + "sampling/sampling_logp_difference/max": 0.4511580467224121, + "sampling/sampling_logp_difference/mean": 0.016117535531520844, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 173.0625, + "completions/mean_terminated_length": 173.0625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4062950611114502, + "epoch": 1.903186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0208390156543426, + "kl": 0.03014496900141239, + "learning_rate": 3.570534161876163e-07, + "loss": 0.0003, + "num_tokens": 48986633.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3640347719192505, + "sampling/importance_sampling_ratio/mean": 1.000274658203125, + "sampling/importance_sampling_ratio/min": 0.6374650001525879, + "sampling/sampling_logp_difference/max": 0.4502559304237366, + "sampling/sampling_logp_difference/mean": 0.015687409788370132, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 246.109375, + "completions/mean_terminated_length": 246.109375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.387340247631073, + "epoch": 1.9044117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015121762657977288, + "kl": 0.025227148085832596, + "learning_rate": 3.5637091586325796e-07, + "loss": 0.0002, + "num_tokens": 49025280.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6264692544937134, + "sampling/importance_sampling_ratio/mean": 0.9998824596405029, + "sampling/importance_sampling_ratio/min": 0.5149549841880798, + "sampling/sampling_logp_difference/max": 0.6636757850646973, + "sampling/sampling_logp_difference/mean": 0.014368398115038872, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 237.859375, + "completions/mean_terminated_length": 237.859375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.39210081100463867, + "epoch": 1.905637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01640127218729267, + "kl": 0.02905341237783432, + "learning_rate": 3.556887070968414e-07, + "loss": 0.0003, + "num_tokens": 49058775.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.514710545539856, + "sampling/importance_sampling_ratio/mean": 0.9997396469116211, + "sampling/importance_sampling_ratio/min": 0.69562166929245, + "sampling/sampling_logp_difference/max": 0.4152243137359619, + "sampling/sampling_logp_difference/mean": 0.014183616265654564, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 239.984375, + "completions/mean_terminated_length": 239.984375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.41489022970199585, + "epoch": 1.906862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018190887281895423, + "kl": 0.03981224074959755, + "learning_rate": 3.550067912732069e-07, + "loss": 0.0004, + "num_tokens": 49094774.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.63120436668396, + "sampling/importance_sampling_ratio/mean": 0.9999634027481079, + "sampling/importance_sampling_ratio/min": 0.6876845955848694, + "sampling/sampling_logp_difference/max": 0.4893186092376709, + "sampling/sampling_logp_difference/mean": 0.01440738607198, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 253.5625, + "completions/mean_terminated_length": 253.5625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.43130192160606384, + "epoch": 1.9080882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7058363093624886, + "kl": 0.025736529380083084, + "learning_rate": 3.5432516977660054e-07, + "loss": 0.0235, + "num_tokens": 49128506.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996404647827148, + "sampling/importance_sampling_ratio/min": 0.6482198238372803, + "sampling/sampling_logp_difference/max": 0.7257037162780762, + "sampling/sampling_logp_difference/mean": 0.014149620197713375, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 238.890625, + "completions/mean_terminated_length": 238.890625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.361882746219635, + "epoch": 1.909313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01411134314914194, + "kl": 0.023615509271621704, + "learning_rate": 3.5364384399067094e-07, + "loss": 0.0002, + "num_tokens": 49161539.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5045721530914307, + "sampling/importance_sampling_ratio/mean": 0.9995989203453064, + "sampling/importance_sampling_ratio/min": 0.5740959048271179, + "sampling/sampling_logp_difference/max": 0.5549588203430176, + "sampling/sampling_logp_difference/mean": 0.013942791149020195, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 240.46875, + "completions/mean_terminated_length": 240.46875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.5358800292015076, + "epoch": 1.9105392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8785223264113101, + "kl": 0.0358688049018383, + "learning_rate": 3.5296281529846593e-07, + "loss": -0.0083, + "num_tokens": 49207201.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6312074661254883, + "sampling/importance_sampling_ratio/mean": 0.9999327659606934, + "sampling/importance_sampling_ratio/min": 0.5513428449630737, + "sampling/sampling_logp_difference/max": 0.5953984260559082, + "sampling/sampling_logp_difference/mean": 0.01753925159573555, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 219.484375, + "completions/mean_terminated_length": 219.484375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.37377434968948364, + "epoch": 1.9117647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9434145076954223, + "kl": 0.03735412657260895, + "learning_rate": 3.5228208508243073e-07, + "loss": 0.0157, + "num_tokens": 49235648.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.667006015777588, + "sampling/importance_sampling_ratio/mean": 0.9995557069778442, + "sampling/importance_sampling_ratio/min": 0.6204636693000793, + "sampling/sampling_logp_difference/max": 0.5110292434692383, + "sampling/sampling_logp_difference/mean": 0.01359252817928791, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 239.734375, + "completions/mean_terminated_length": 239.734375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.32890066504478455, + "epoch": 1.9129901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1616256608714812, + "kl": 0.03831387311220169, + "learning_rate": 3.5160165472440467e-07, + "loss": -0.0073, + "num_tokens": 49268959.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.3948997259140015, + "sampling/importance_sampling_ratio/mean": 0.9996709823608398, + "sampling/importance_sampling_ratio/min": 0.6452450156211853, + "sampling/sampling_logp_difference/max": 0.4381251335144043, + "sampling/sampling_logp_difference/mean": 0.012082546949386597, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 202.125, + "completions/mean_terminated_length": 202.125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.40179872512817383, + "epoch": 1.9142156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02303797620106814, + "kl": 0.03159317001700401, + "learning_rate": 3.509215256056183e-07, + "loss": 0.0003, + "num_tokens": 49299719.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3907690048217773, + "sampling/importance_sampling_ratio/mean": 1.000664234161377, + "sampling/importance_sampling_ratio/min": 0.6130973696708679, + "sampling/sampling_logp_difference/max": 0.48923158645629883, + "sampling/sampling_logp_difference/mean": 0.014336859807372093, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 238.03125, + "completions/mean_terminated_length": 238.03125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3968772292137146, + "epoch": 1.9154411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3002040600877474, + "kl": 0.02640102617442608, + "learning_rate": 3.502416991066904e-07, + "loss": -0.0395, + "num_tokens": 49331577.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5946547985076904, + "sampling/importance_sampling_ratio/mean": 0.9998785853385925, + "sampling/importance_sampling_ratio/min": 0.6462288498878479, + "sampling/sampling_logp_difference/max": 0.46665728092193604, + "sampling/sampling_logp_difference/mean": 0.013803413137793541, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 243.5, + "completions/mean_terminated_length": 243.5, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3707587718963623, + "epoch": 1.9166666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9164581318319811, + "kl": 0.023630518466234207, + "learning_rate": 3.495621766076259e-07, + "loss": 0.0504, + "num_tokens": 49365529.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5934795141220093, + "sampling/importance_sampling_ratio/mean": 0.9997700452804565, + "sampling/importance_sampling_ratio/min": 0.6277029514312744, + "sampling/sampling_logp_difference/max": 0.46591997146606445, + "sampling/sampling_logp_difference/mean": 0.014482099562883377, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 205.46875, + "completions/mean_terminated_length": 205.46875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.37905019521713257, + "epoch": 1.9178921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037330464795994424, + "kl": 0.07470554113388062, + "learning_rate": 3.488829594878123e-07, + "loss": 0.0006, + "num_tokens": 49396087.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.656383752822876, + "sampling/importance_sampling_ratio/mean": 0.9995221495628357, + "sampling/importance_sampling_ratio/min": 0.6877167820930481, + "sampling/sampling_logp_difference/max": 0.5046367645263672, + "sampling/sampling_logp_difference/mean": 0.014606459997594357, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 214.5625, + "completions/mean_terminated_length": 214.5625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4915218949317932, + "epoch": 1.9191176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.089900933626398, + "kl": 0.049614038318395615, + "learning_rate": 3.4820404912601757e-07, + "loss": 0.0057, + "num_tokens": 49431371.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5227725505828857, + "sampling/importance_sampling_ratio/mean": 1.0006821155548096, + "sampling/importance_sampling_ratio/min": 0.6858685612678528, + "sampling/sampling_logp_difference/max": 0.4205327033996582, + "sampling/sampling_logp_difference/mean": 0.017097633332014084, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 192.4375, + "completions/mean_terminated_length": 192.4375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.34200167655944824, + "epoch": 1.920343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016345475038858755, + "kl": 0.029332738369703293, + "learning_rate": 3.4752544690038643e-07, + "loss": 0.0003, + "num_tokens": 49460183.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6541374921798706, + "sampling/importance_sampling_ratio/mean": 0.9999823570251465, + "sampling/importance_sampling_ratio/min": 0.6625405550003052, + "sampling/sampling_logp_difference/max": 0.5032796859741211, + "sampling/sampling_logp_difference/mean": 0.013198770582675934, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.0, + "completions/max_terminated_length": 641.0, + "completions/mean_length": 224.46875, + "completions/mean_terminated_length": 224.46875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.5203051567077637, + "epoch": 1.9215686274509802, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3166060363514769, + "kl": 0.06350134313106537, + "learning_rate": 3.468471541884385e-07, + "loss": -0.0294, + "num_tokens": 49489317.0, + "reward": -0.125, + "reward_std": 0.7023203372955322, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.3266730308532715, + "sampling/importance_sampling_ratio/mean": 0.9998144507408142, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.017328787595033646, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 246.65625, + "completions/mean_terminated_length": 246.65625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.4162110984325409, + "epoch": 1.9227941176470589, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.673909289737263, + "kl": 0.03644551709294319, + "learning_rate": 3.461691723670651e-07, + "loss": 0.0134, + "num_tokens": 49521119.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4777597188949585, + "sampling/importance_sampling_ratio/mean": 1.0004178285598755, + "sampling/importance_sampling_ratio/min": 0.6392900943756104, + "sampling/sampling_logp_difference/max": 0.44739699363708496, + "sampling/sampling_logp_difference/mean": 0.014774038456380367, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 170.734375, + "completions/mean_terminated_length": 170.734375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3501443862915039, + "epoch": 1.9240196078431373, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020873349081082473, + "kl": 0.028426751494407654, + "learning_rate": 3.454915028125263e-07, + "loss": 0.0003, + "num_tokens": 49549694.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4397555589675903, + "sampling/importance_sampling_ratio/mean": 0.9996906518936157, + "sampling/importance_sampling_ratio/min": 0.6401638984680176, + "sampling/sampling_logp_difference/max": 0.4460310935974121, + "sampling/sampling_logp_difference/mean": 0.014869427308440208, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 203.515625, + "completions/mean_terminated_length": 203.515625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.46519067883491516, + "epoch": 1.9252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019422375818408243, + "kl": 0.030188778415322304, + "learning_rate": 3.4481414690044836e-07, + "loss": 0.0003, + "num_tokens": 49583199.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.432383418083191, + "sampling/importance_sampling_ratio/mean": 1.0000834465026855, + "sampling/importance_sampling_ratio/min": 0.6342064738273621, + "sampling/sampling_logp_difference/max": 0.4553806781768799, + "sampling/sampling_logp_difference/mean": 0.017122812569141388, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 214.484375, + "completions/mean_terminated_length": 214.484375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3726535737514496, + "epoch": 1.9264705882352942, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.123537651624613, + "kl": 0.026137355715036392, + "learning_rate": 3.441371060058209e-07, + "loss": 0.012, + "num_tokens": 49615406.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.431039810180664, + "sampling/importance_sampling_ratio/mean": 0.9996926784515381, + "sampling/importance_sampling_ratio/min": 0.7300207614898682, + "sampling/sampling_logp_difference/max": 0.3584012985229492, + "sampling/sampling_logp_difference/mean": 0.01359387207776308, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 178.25, + "completions/mean_terminated_length": 178.25, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.37289541959762573, + "epoch": 1.9276960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019952024297273645, + "kl": 0.030565187335014343, + "learning_rate": 3.4346038150299425e-07, + "loss": 0.0003, + "num_tokens": 49639806.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4249707460403442, + "sampling/importance_sampling_ratio/mean": 0.9998721480369568, + "sampling/importance_sampling_ratio/min": 0.607313334941864, + "sampling/sampling_logp_difference/max": 0.49871039390563965, + "sampling/sampling_logp_difference/mean": 0.015014609321951866, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 187.71875, + "completions/mean_terminated_length": 187.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.39289116859436035, + "epoch": 1.928921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015109079161216534, + "kl": 0.02577681839466095, + "learning_rate": 3.427839747656758e-07, + "loss": 0.0003, + "num_tokens": 49671436.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4461145401000977, + "sampling/importance_sampling_ratio/mean": 1.0000542402267456, + "sampling/importance_sampling_ratio/min": 0.6255195140838623, + "sampling/sampling_logp_difference/max": 0.46917271614074707, + "sampling/sampling_logp_difference/mean": 0.014366412535309792, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 178.265625, + "completions/mean_terminated_length": 178.265625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.389309823513031, + "epoch": 1.9301470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9804361746626181, + "kl": 0.04585760459303856, + "learning_rate": 3.4210788716692875e-07, + "loss": -0.005, + "num_tokens": 49698685.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.3113781213760376, + "sampling/importance_sampling_ratio/mean": 1.000306487083435, + "sampling/importance_sampling_ratio/min": 0.6482202410697937, + "sampling/sampling_logp_difference/max": 0.4335247278213501, + "sampling/sampling_logp_difference/mean": 0.013438409194350243, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 198.78125, + "completions/mean_terminated_length": 198.78125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.4965009391307831, + "epoch": 1.9313725490196079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1148114534308606, + "kl": 0.031996533274650574, + "learning_rate": 3.414321200791679e-07, + "loss": -0.0272, + "num_tokens": 49735903.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5280892848968506, + "sampling/importance_sampling_ratio/mean": 1.0006356239318848, + "sampling/importance_sampling_ratio/min": 0.7082610726356506, + "sampling/sampling_logp_difference/max": 0.42401814460754395, + "sampling/sampling_logp_difference/mean": 0.017043106257915497, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 182.671875, + "completions/mean_terminated_length": 182.671875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.45260685682296753, + "epoch": 1.9325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023600819454342645, + "kl": 0.03608972579240799, + "learning_rate": 3.4075667487415785e-07, + "loss": 0.0004, + "num_tokens": 49768426.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6213918924331665, + "sampling/importance_sampling_ratio/mean": 1.0005576610565186, + "sampling/importance_sampling_ratio/min": 0.6077486276626587, + "sampling/sampling_logp_difference/max": 0.49799394607543945, + "sampling/sampling_logp_difference/mean": 0.01709265448153019, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 210.421875, + "completions/mean_terminated_length": 210.421875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.41075772047042847, + "epoch": 1.9338235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7296178305784208, + "kl": 0.058899056166410446, + "learning_rate": 3.4008155292300934e-07, + "loss": -0.0186, + "num_tokens": 49796725.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.326687216758728, + "sampling/importance_sampling_ratio/mean": 1.000306248664856, + "sampling/importance_sampling_ratio/min": 0.6764224171638489, + "sampling/sampling_logp_difference/max": 0.39093756675720215, + "sampling/sampling_logp_difference/mean": 0.016160249710083008, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 213.765625, + "completions/mean_terminated_length": 213.765625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.37981435656547546, + "epoch": 1.9350490196078431, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7461768884595513, + "kl": 0.05328400433063507, + "learning_rate": 3.3940675559617723e-07, + "loss": 0.0033, + "num_tokens": 49832646.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6485239267349243, + "sampling/importance_sampling_ratio/mean": 0.9997677803039551, + "sampling/importance_sampling_ratio/min": 0.6063785552978516, + "sampling/sampling_logp_difference/max": 0.5002508163452148, + "sampling/sampling_logp_difference/mean": 0.014991349540650845, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 184.625, + "completions/mean_terminated_length": 184.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4077078104019165, + "epoch": 1.9362745098039216, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021226211046230563, + "kl": 0.03923317417502403, + "learning_rate": 3.3873228426345757e-07, + "loss": 0.0004, + "num_tokens": 49857310.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3727229833602905, + "sampling/importance_sampling_ratio/mean": 0.9997315406799316, + "sampling/importance_sampling_ratio/min": 0.6232230067253113, + "sampling/sampling_logp_difference/max": 0.4728507995605469, + "sampling/sampling_logp_difference/mean": 0.01577834039926529, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 174.546875, + "completions/mean_terminated_length": 174.546875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4120787978172302, + "epoch": 1.9375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10868653636032384, + "kl": 0.09270511567592621, + "learning_rate": 3.380581402939841e-07, + "loss": 0.0007, + "num_tokens": 49882097.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5144963264465332, + "sampling/importance_sampling_ratio/mean": 0.9998019933700562, + "sampling/importance_sampling_ratio/min": 0.6171509027481079, + "sampling/sampling_logp_difference/max": 0.48264169692993164, + "sampling/sampling_logp_difference/mean": 0.01620662957429886, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 219.890625, + "completions/mean_terminated_length": 219.890625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4209383726119995, + "epoch": 1.9387254901960784, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1926904175844644, + "kl": 0.032342009246349335, + "learning_rate": 3.373843250562265e-07, + "loss": 0.0034, + "num_tokens": 49917338.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4694732427597046, + "sampling/importance_sampling_ratio/mean": 0.9998754262924194, + "sampling/importance_sampling_ratio/min": 0.6038687825202942, + "sampling/sampling_logp_difference/max": 0.5043983459472656, + "sampling/sampling_logp_difference/mean": 0.015703190118074417, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 186.390625, + "completions/mean_terminated_length": 186.390625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3987663984298706, + "epoch": 1.9399509803921569, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023364620329624858, + "kl": 0.06759518384933472, + "learning_rate": 3.3671083991798697e-07, + "loss": 0.0006, + "num_tokens": 49944627.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.444628119468689, + "sampling/importance_sampling_ratio/mean": 0.9996543526649475, + "sampling/importance_sampling_ratio/min": 0.7106347680091858, + "sampling/sampling_logp_difference/max": 0.36785197257995605, + "sampling/sampling_logp_difference/mean": 0.015497813001275063, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 214.234375, + "completions/mean_terminated_length": 214.234375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3201386332511902, + "epoch": 1.9411764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028574620573220313, + "kl": 0.024369578808546066, + "learning_rate": 3.360376862463978e-07, + "loss": 0.0002, + "num_tokens": 49972450.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5025506019592285, + "sampling/importance_sampling_ratio/mean": 0.9998648166656494, + "sampling/importance_sampling_ratio/min": 0.2960188686847687, + "sampling/sampling_logp_difference/max": 1.217332124710083, + "sampling/sampling_logp_difference/mean": 0.013632440008223057, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 187.03125, + "completions/mean_terminated_length": 187.03125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3416188657283783, + "epoch": 1.9424019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0162594117210751, + "kl": 0.025658920407295227, + "learning_rate": 3.3536486540791823e-07, + "loss": 0.0003, + "num_tokens": 50000116.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.54694402217865, + "sampling/importance_sampling_ratio/mean": 1.0000746250152588, + "sampling/importance_sampling_ratio/min": 0.7320797443389893, + "sampling/sampling_logp_difference/max": 0.4362814426422119, + "sampling/sampling_logp_difference/mean": 0.013598126359283924, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 218.421875, + "completions/mean_terminated_length": 218.421875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3883388638496399, + "epoch": 1.9436274509803921, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8629936895714069, + "kl": 0.031464897096157074, + "learning_rate": 3.3469237876833187e-07, + "loss": -0.0288, + "num_tokens": 50035775.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.799018383026123, + "sampling/importance_sampling_ratio/mean": 0.9999352097511292, + "sampling/importance_sampling_ratio/min": 0.6191887855529785, + "sampling/sampling_logp_difference/max": 0.5872411727905273, + "sampling/sampling_logp_difference/mean": 0.014580667950212955, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 174.375, + "completions/mean_terminated_length": 174.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.33257895708084106, + "epoch": 1.9448529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021657433703474765, + "kl": 0.023327291011810303, + "learning_rate": 3.340202276927442e-07, + "loss": 0.0002, + "num_tokens": 50066983.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.430035948753357, + "sampling/importance_sampling_ratio/mean": 1.0003684759140015, + "sampling/importance_sampling_ratio/min": 0.6523290276527405, + "sampling/sampling_logp_difference/max": 0.42720627784729004, + "sampling/sampling_logp_difference/mean": 0.014051834121346474, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 308.71875, + "completions/mean_terminated_length": 308.71875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4223977029323578, + "epoch": 1.946078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5606014676586772, + "kl": 0.05478545278310776, + "learning_rate": 3.333484135455792e-07, + "loss": -0.0113, + "num_tokens": 50106853.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5471713542938232, + "sampling/importance_sampling_ratio/mean": 1.0003740787506104, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.014794318936765194, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 200.0625, + "completions/mean_terminated_length": 200.0625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.4382716417312622, + "epoch": 1.9473039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019181652014097182, + "kl": 0.030908845365047455, + "learning_rate": 3.326769376905769e-07, + "loss": 0.0003, + "num_tokens": 50153417.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6531814336776733, + "sampling/importance_sampling_ratio/mean": 1.0004072189331055, + "sampling/importance_sampling_ratio/min": 0.7394487857818604, + "sampling/sampling_logp_difference/max": 0.5027015209197998, + "sampling/sampling_logp_difference/mean": 0.016115907579660416, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 290.40625, + "completions/mean_terminated_length": 290.40625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.3952628970146179, + "epoch": 1.9485294117647058, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6131906149656822, + "kl": 0.036005325615406036, + "learning_rate": 3.3200580149079083e-07, + "loss": -0.0234, + "num_tokens": 50195763.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 0.999909520149231, + "sampling/importance_sampling_ratio/min": 0.7331599593162537, + "sampling/sampling_logp_difference/max": 0.48689985275268555, + "sampling/sampling_logp_difference/mean": 0.01339157484471798, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 210.515625, + "completions/mean_terminated_length": 210.515625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.4929760694503784, + "epoch": 1.9497549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1510936162821228, + "kl": 0.040675677359104156, + "learning_rate": 3.31335006308585e-07, + "loss": -0.0104, + "num_tokens": 50224868.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.4317108392715454, + "sampling/importance_sampling_ratio/mean": 0.9994465112686157, + "sampling/importance_sampling_ratio/min": 0.6771494150161743, + "sampling/sampling_logp_difference/max": 0.3898632526397705, + "sampling/sampling_logp_difference/mean": 0.016803476959466934, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 220.375, + "completions/mean_terminated_length": 220.375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.43143031001091003, + "epoch": 1.9509803921568627, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1763649983494817, + "kl": 0.06657734513282776, + "learning_rate": 3.3066455350563115e-07, + "loss": -0.0035, + "num_tokens": 50254300.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.3829240798950195, + "sampling/importance_sampling_ratio/mean": 0.9998020529747009, + "sampling/importance_sampling_ratio/min": 0.6788738369941711, + "sampling/sampling_logp_difference/max": 0.3873199224472046, + "sampling/sampling_logp_difference/mean": 0.015550259500741959, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 221.65625, + "completions/mean_terminated_length": 221.65625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4554600119590759, + "epoch": 1.9522058823529411, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0336284950443926, + "kl": 0.050704628229141235, + "learning_rate": 3.29994444442906e-07, + "loss": -0.0701, + "num_tokens": 50286406.0, + "reward": 0.375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6007791757583618, + "sampling/importance_sampling_ratio/mean": 0.9997819066047668, + "sampling/importance_sampling_ratio/min": 0.6097137331962585, + "sampling/sampling_logp_difference/max": 0.4947657585144043, + "sampling/sampling_logp_difference/mean": 0.016511352732777596, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 263.359375, + "completions/mean_terminated_length": 263.359375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.513678252696991, + "epoch": 1.9534313725490198, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.2700148200836998, + "kl": 0.06838428229093552, + "learning_rate": 3.2932468048068836e-07, + "loss": 0.0475, + "num_tokens": 50322765.0, + "reward": 0.4375, + "reward_std": 0.6267197132110596, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4775911569595337, + "sampling/importance_sampling_ratio/mean": 0.9996185898780823, + "sampling/importance_sampling_ratio/min": 0.6207948327064514, + "sampling/sampling_logp_difference/max": 0.47675466537475586, + "sampling/sampling_logp_difference/mean": 0.01690804958343506, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 248.203125, + "completions/mean_terminated_length": 248.203125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.2967587113380432, + "epoch": 1.954656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7413381994218197, + "kl": 0.03382870927453041, + "learning_rate": 3.2865526297855694e-07, + "loss": 0.0012, + "num_tokens": 50359994.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4068033695220947, + "sampling/importance_sampling_ratio/mean": 1.000199317932129, + "sampling/importance_sampling_ratio/min": 0.6435267925262451, + "sampling/sampling_logp_difference/max": 0.44079160690307617, + "sampling/sampling_logp_difference/mean": 0.011207588016986847, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 194.9375, + "completions/mean_terminated_length": 194.9375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.41791579127311707, + "epoch": 1.9558823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9452934538935908, + "kl": 0.03728911280632019, + "learning_rate": 3.2798619329538646e-07, + "loss": 0.0215, + "num_tokens": 50388630.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.3477599620819092, + "sampling/importance_sampling_ratio/mean": 1.0002269744873047, + "sampling/importance_sampling_ratio/min": 0.6202828288078308, + "sampling/sampling_logp_difference/max": 0.47757983207702637, + "sampling/sampling_logp_difference/mean": 0.016275346279144287, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 233.484375, + "completions/mean_terminated_length": 233.484375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.5423104763031006, + "epoch": 1.9571078431372548, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3498248195388074, + "kl": 0.062157321721315384, + "learning_rate": 3.2731747278934623e-07, + "loss": -0.0171, + "num_tokens": 50423957.0, + "reward": 0.375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5467382669448853, + "sampling/importance_sampling_ratio/mean": 0.9999659061431885, + "sampling/importance_sampling_ratio/min": 0.6392384171485901, + "sampling/sampling_logp_difference/max": 0.4474778175354004, + "sampling/sampling_logp_difference/mean": 0.01773090660572052, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 247.96875, + "completions/mean_terminated_length": 247.96875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.40003079175949097, + "epoch": 1.9583333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.689370522366843, + "kl": 0.034257661551237106, + "learning_rate": 3.266491028178964e-07, + "loss": 0.0228, + "num_tokens": 50458819.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.549085021018982, + "sampling/importance_sampling_ratio/mean": 1.0001552104949951, + "sampling/importance_sampling_ratio/min": 0.6638208627700806, + "sampling/sampling_logp_difference/max": 0.4376645088195801, + "sampling/sampling_logp_difference/mean": 0.01411025132983923, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 218.484375, + "completions/mean_terminated_length": 218.484375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3510703444480896, + "epoch": 1.9595588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011985749171011403, + "kl": 0.020307883620262146, + "learning_rate": 3.2598108473778595e-07, + "loss": 0.0002, + "num_tokens": 50490194.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4143846035003662, + "sampling/importance_sampling_ratio/mean": 1.0001227855682373, + "sampling/importance_sampling_ratio/min": 0.6795847415924072, + "sampling/sampling_logp_difference/max": 0.3862733840942383, + "sampling/sampling_logp_difference/mean": 0.01346974354237318, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 236.0, + "completions/mean_terminated_length": 236.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.4613341987133026, + "epoch": 1.9607843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7638189527870977, + "kl": 0.05117883160710335, + "learning_rate": 3.253134199050489e-07, + "loss": 0.0099, + "num_tokens": 50523026.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.527489423751831, + "sampling/importance_sampling_ratio/mean": 0.9998858571052551, + "sampling/importance_sampling_ratio/min": 0.6256482601165771, + "sampling/sampling_logp_difference/max": 0.4689669609069824, + "sampling/sampling_logp_difference/mean": 0.015857355669140816, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 239.0, + "completions/mean_terminated_length": 239.0, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.40186768770217896, + "epoch": 1.9620098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015758176314101594, + "kl": 0.02447192184627056, + "learning_rate": 3.2464610967500273e-07, + "loss": 0.0002, + "num_tokens": 50558434.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6482127904891968, + "sampling/importance_sampling_ratio/mean": 1.000008225440979, + "sampling/importance_sampling_ratio/min": 0.6263538002967834, + "sampling/sampling_logp_difference/max": 0.4996914863586426, + "sampling/sampling_logp_difference/mean": 0.014021730050444603, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 261.59375, + "completions/mean_terminated_length": 261.59375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.3546887934207916, + "epoch": 1.9632352941176472, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7056207683313288, + "kl": 0.04179853945970535, + "learning_rate": 3.239791554022449e-07, + "loss": 0.0169, + "num_tokens": 50593960.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.441324234008789, + "sampling/importance_sampling_ratio/mean": 1.000085473060608, + "sampling/importance_sampling_ratio/min": 0.6648438572883606, + "sampling/sampling_logp_difference/max": 0.408203125, + "sampling/sampling_logp_difference/mean": 0.012519138865172863, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 227.9375, + "completions/mean_terminated_length": 227.9375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3909626603126526, + "epoch": 1.9644607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01686007611299536, + "kl": 0.02554922364652157, + "learning_rate": 3.233125584406505e-07, + "loss": 0.0002, + "num_tokens": 50627668.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5972938537597656, + "sampling/importance_sampling_ratio/mean": 0.9998860359191895, + "sampling/importance_sampling_ratio/min": 0.6601378917694092, + "sampling/sampling_logp_difference/max": 0.4683108329772949, + "sampling/sampling_logp_difference/mean": 0.014788507483899593, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 214.71875, + "completions/mean_terminated_length": 214.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.34474942088127136, + "epoch": 1.965686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015010132983517764, + "kl": 0.025028621777892113, + "learning_rate": 3.226463201433688e-07, + "loss": 0.0002, + "num_tokens": 50661570.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995816349983215, + "sampling/importance_sampling_ratio/min": 0.5260617733001709, + "sampling/sampling_logp_difference/max": 0.9235885143280029, + "sampling/sampling_logp_difference/mean": 0.013952052220702171, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 218.21875, + "completions/mean_terminated_length": 218.21875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.4414424002170563, + "epoch": 1.9669117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018974998262863217, + "kl": 0.03201860934495926, + "learning_rate": 3.219804418628216e-07, + "loss": 0.0003, + "num_tokens": 50695936.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3888485431671143, + "sampling/importance_sampling_ratio/mean": 1.0003587007522583, + "sampling/importance_sampling_ratio/min": 0.755850613117218, + "sampling/sampling_logp_difference/max": 0.3284749984741211, + "sampling/sampling_logp_difference/mean": 0.01593206077814102, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 196.203125, + "completions/mean_terminated_length": 196.203125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4284861385822296, + "epoch": 1.968137254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.290646620478447, + "kl": 0.03602099046111107, + "learning_rate": 3.2131492495069965e-07, + "loss": -0.0239, + "num_tokens": 50729741.0, + "reward": -0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.396368145942688, + "sampling/importance_sampling_ratio/mean": 1.0005464553833008, + "sampling/importance_sampling_ratio/min": 0.7253676652908325, + "sampling/sampling_logp_difference/max": 0.3338747024536133, + "sampling/sampling_logp_difference/mean": 0.01478176936507225, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 252.390625, + "completions/mean_terminated_length": 252.390625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.39664268493652344, + "epoch": 1.969362745098039, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0307934191508574, + "kl": 0.025808095932006836, + "learning_rate": 3.206497707579598e-07, + "loss": -0.0956, + "num_tokens": 50765318.0, + "reward": 0.53125, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3490233421325684, + "sampling/importance_sampling_ratio/mean": 0.9996511936187744, + "sampling/importance_sampling_ratio/min": 0.7528896927833557, + "sampling/sampling_logp_difference/max": 0.2993807792663574, + "sampling/sampling_logp_difference/mean": 0.013657033443450928, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 180.984375, + "completions/mean_terminated_length": 180.984375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.37230825424194336, + "epoch": 1.9705882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017144080067726417, + "kl": 0.03063638135790825, + "learning_rate": 3.199849806348233e-07, + "loss": 0.0003, + "num_tokens": 50795701.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5719265937805176, + "sampling/importance_sampling_ratio/mean": 0.9995201826095581, + "sampling/importance_sampling_ratio/min": 0.645332932472229, + "sampling/sampling_logp_difference/max": 0.4523019790649414, + "sampling/sampling_logp_difference/mean": 0.014999780803918839, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 168.40625, + "completions/mean_terminated_length": 168.40625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4284522831439972, + "epoch": 1.971813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9406603697428186, + "kl": 0.03776994347572327, + "learning_rate": 3.1932055593077166e-07, + "loss": 0.0168, + "num_tokens": 50821407.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4775913953781128, + "sampling/importance_sampling_ratio/mean": 1.0000436305999756, + "sampling/importance_sampling_ratio/min": 0.6637405157089233, + "sampling/sampling_logp_difference/max": 0.4098639488220215, + "sampling/sampling_logp_difference/mean": 0.016887390986084938, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 201.15625, + "completions/mean_terminated_length": 201.15625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.44030991196632385, + "epoch": 1.9730392156862746, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.346565925620663, + "kl": 0.044444017112255096, + "learning_rate": 3.186564979945453e-07, + "loss": -0.024, + "num_tokens": 50853561.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7065656185150146, + "sampling/importance_sampling_ratio/mean": 1.0007504224777222, + "sampling/importance_sampling_ratio/min": 0.6331958770751953, + "sampling/sampling_logp_difference/max": 0.5344829559326172, + "sampling/sampling_logp_difference/mean": 0.016215015202760696, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 241.515625, + "completions/mean_terminated_length": 241.515625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3774697482585907, + "epoch": 1.9742647058823528, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0093232935662737, + "kl": 0.0236565750092268, + "learning_rate": 3.179928081741394e-07, + "loss": -0.0086, + "num_tokens": 50895850.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3670462369918823, + "sampling/importance_sampling_ratio/mean": 1.000116229057312, + "sampling/importance_sampling_ratio/min": 0.604745626449585, + "sampling/sampling_logp_difference/max": 0.5029473304748535, + "sampling/sampling_logp_difference/mean": 0.013629928231239319, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 220.53125, + "completions/mean_terminated_length": 220.53125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.46636515855789185, + "epoch": 1.9754901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7741735253856575, + "kl": 0.033601656556129456, + "learning_rate": 3.173294878168025e-07, + "loss": -0.0036, + "num_tokens": 50926812.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5776169300079346, + "sampling/importance_sampling_ratio/mean": 0.9993590712547302, + "sampling/importance_sampling_ratio/min": 0.6298578381538391, + "sampling/sampling_logp_difference/max": 0.4622611999511719, + "sampling/sampling_logp_difference/mean": 0.016215326264500618, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 203.0, + "completions/mean_terminated_length": 203.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4220837354660034, + "epoch": 1.9767156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03491036981233961, + "kl": 0.03890404850244522, + "learning_rate": 3.166665382690327e-07, + "loss": 0.0004, + "num_tokens": 50958988.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6628754138946533, + "sampling/importance_sampling_ratio/mean": 0.9996306896209717, + "sampling/importance_sampling_ratio/min": 0.3209123909473419, + "sampling/sampling_logp_difference/max": 1.1365870237350464, + "sampling/sampling_logp_difference/mean": 0.016190893948078156, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 211.5625, + "completions/mean_terminated_length": 211.5625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3358461558818817, + "epoch": 1.9779411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014874952855669052, + "kl": 0.023703187704086304, + "learning_rate": 3.1600396087657586e-07, + "loss": 0.0002, + "num_tokens": 50988080.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2980740070343018, + "sampling/importance_sampling_ratio/mean": 0.9990770220756531, + "sampling/importance_sampling_ratio/min": 0.661005973815918, + "sampling/sampling_logp_difference/max": 0.41399240493774414, + "sampling/sampling_logp_difference/mean": 0.013611245900392532, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 223.515625, + "completions/mean_terminated_length": 223.515625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.39452552795410156, + "epoch": 1.9791666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8423314298277232, + "kl": 0.03638045862317085, + "learning_rate": 3.153417569844219e-07, + "loss": -0.0026, + "num_tokens": 51024689.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5418307781219482, + "sampling/importance_sampling_ratio/mean": 0.9999921321868896, + "sampling/importance_sampling_ratio/min": 0.6482195258140564, + "sampling/sampling_logp_difference/max": 0.43352580070495605, + "sampling/sampling_logp_difference/mean": 0.0139867402613163, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 184.890625, + "completions/mean_terminated_length": 184.890625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5079770088195801, + "epoch": 1.9803921568627452, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030758759284119606, + "kl": 0.058124691247940063, + "learning_rate": 3.1467992793680267e-07, + "loss": 0.0006, + "num_tokens": 51060714.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4924474954605103, + "sampling/importance_sampling_ratio/mean": 0.9997177124023438, + "sampling/importance_sampling_ratio/min": 0.7062594890594482, + "sampling/sampling_logp_difference/max": 0.4004173278808594, + "sampling/sampling_logp_difference/mean": 0.017262417823076248, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 166.015625, + "completions/mean_terminated_length": 166.015625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.42368459701538086, + "epoch": 1.9816176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9664555440161905, + "kl": 0.04469449445605278, + "learning_rate": 3.140184750771895e-07, + "loss": 0.0118, + "num_tokens": 51088299.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.406961441040039, + "sampling/importance_sampling_ratio/mean": 0.9997423887252808, + "sampling/importance_sampling_ratio/min": 0.6152718663215637, + "sampling/sampling_logp_difference/max": 0.4856910705566406, + "sampling/sampling_logp_difference/mean": 0.015562538057565689, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 184.421875, + "completions/mean_terminated_length": 184.421875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.43028944730758667, + "epoch": 1.982843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8922780422666251, + "kl": 0.043754082173109055, + "learning_rate": 3.133573997482896e-07, + "loss": 0.0193, + "num_tokens": 51123302.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4377474784851074, + "sampling/importance_sampling_ratio/mean": 0.9996957182884216, + "sampling/importance_sampling_ratio/min": 0.6646489500999451, + "sampling/sampling_logp_difference/max": 0.40849626064300537, + "sampling/sampling_logp_difference/mean": 0.015855390578508377, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 190.28125, + "completions/mean_terminated_length": 190.28125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.416605681180954, + "epoch": 1.9840686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01691404971353703, + "kl": 0.027441244572401047, + "learning_rate": 3.1269670329204393e-07, + "loss": 0.0003, + "num_tokens": 51155560.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6105904579162598, + "sampling/importance_sampling_ratio/mean": 1.0004615783691406, + "sampling/importance_sampling_ratio/min": 0.6894065141677856, + "sampling/sampling_logp_difference/max": 0.47660088539123535, + "sampling/sampling_logp_difference/mean": 0.015048052184283733, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 159.6875, + "completions/mean_terminated_length": 159.6875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3821426033973694, + "epoch": 1.9852941176470589, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017578685537538653, + "kl": 0.03149223327636719, + "learning_rate": 3.1203638704962465e-07, + "loss": 0.0003, + "num_tokens": 51183060.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4779694080352783, + "sampling/importance_sampling_ratio/mean": 0.9998247623443604, + "sampling/importance_sampling_ratio/min": 0.662719190120697, + "sampling/sampling_logp_difference/max": 0.4114038944244385, + "sampling/sampling_logp_difference/mean": 0.014318181201815605, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 216.84375, + "completions/mean_terminated_length": 216.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.48681575059890747, + "epoch": 1.9865196078431373, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5712534163066981, + "kl": 0.06585294008255005, + "learning_rate": 3.11376452361432e-07, + "loss": -0.0004, + "num_tokens": 51211962.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6226540803909302, + "sampling/importance_sampling_ratio/mean": 0.9997466206550598, + "sampling/importance_sampling_ratio/min": 0.7015644907951355, + "sampling/sampling_logp_difference/max": 0.48406314849853516, + "sampling/sampling_logp_difference/mean": 0.017295796424150467, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 188.828125, + "completions/mean_terminated_length": 188.828125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.3920605480670929, + "epoch": 1.9877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01813063579027912, + "kl": 0.028277505189180374, + "learning_rate": 3.107169005670912e-07, + "loss": 0.0003, + "num_tokens": 51239343.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.439265489578247, + "sampling/importance_sampling_ratio/mean": 1.0000557899475098, + "sampling/importance_sampling_ratio/min": 0.4821047782897949, + "sampling/sampling_logp_difference/max": 0.7295938730239868, + "sampling/sampling_logp_difference/mean": 0.014987459406256676, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 164.34375, + "completions/mean_terminated_length": 164.34375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.48923131823539734, + "epoch": 1.9889705882352942, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0667513202962708, + "kl": 0.06547729671001434, + "learning_rate": 3.100577330054508e-07, + "loss": -0.0016, + "num_tokens": 51270677.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4297629594802856, + "sampling/importance_sampling_ratio/mean": 0.9998574256896973, + "sampling/importance_sampling_ratio/min": 0.6807543635368347, + "sampling/sampling_logp_difference/max": 0.38455379009246826, + "sampling/sampling_logp_difference/mean": 0.016909096390008926, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 195.59375, + "completions/mean_terminated_length": 195.59375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.5149567127227783, + "epoch": 1.9901960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3214360322573169, + "kl": 0.04750575125217438, + "learning_rate": 3.0939895101457914e-07, + "loss": -0.0654, + "num_tokens": 51298539.0, + "reward": 0.28125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.426176905632019, + "sampling/importance_sampling_ratio/mean": 1.0000321865081787, + "sampling/importance_sampling_ratio/min": 0.6894845962524414, + "sampling/sampling_logp_difference/max": 0.3718109130859375, + "sampling/sampling_logp_difference/mean": 0.016956638544797897, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 271.9375, + "completions/mean_terminated_length": 271.9375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.273445725440979, + "epoch": 1.991421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011491568779886541, + "kl": 0.02013445645570755, + "learning_rate": 3.087405559317622e-07, + "loss": 0.0002, + "num_tokens": 51333431.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4298361539840698, + "sampling/importance_sampling_ratio/mean": 0.9997339844703674, + "sampling/importance_sampling_ratio/min": 0.6772682666778564, + "sampling/sampling_logp_difference/max": 0.38968777656555176, + "sampling/sampling_logp_difference/mean": 0.010294242762029171, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 220.265625, + "completions/mean_terminated_length": 220.265625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.4093412458896637, + "epoch": 1.9926470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7499495984707397, + "kl": 0.027860896661877632, + "learning_rate": 3.0808254909349986e-07, + "loss": 0.007, + "num_tokens": 51364552.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3708269596099854, + "sampling/importance_sampling_ratio/mean": 0.9997884631156921, + "sampling/importance_sampling_ratio/min": 0.6372929811477661, + "sampling/sampling_logp_difference/max": 0.45052576065063477, + "sampling/sampling_logp_difference/mean": 0.013881012797355652, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 228.546875, + "completions/mean_terminated_length": 228.546875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.43547090888023376, + "epoch": 1.9938725490196079, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01597495877420292, + "kl": 0.030936989933252335, + "learning_rate": 3.0742493183550454e-07, + "loss": 0.0003, + "num_tokens": 51401035.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2765107154846191, + "sampling/importance_sampling_ratio/mean": 1.000434398651123, + "sampling/importance_sampling_ratio/min": 0.4848077595233917, + "sampling/sampling_logp_difference/max": 0.7240028381347656, + "sampling/sampling_logp_difference/mean": 0.01582859829068184, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 656.0, + "completions/max_terminated_length": 656.0, + "completions/mean_length": 260.921875, + "completions/mean_terminated_length": 260.921875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.5362026691436768, + "epoch": 1.9950980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8660097690633888, + "kl": 0.061552420258522034, + "learning_rate": 3.0676770549269786e-07, + "loss": -0.0167, + "num_tokens": 51438982.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.423123836517334, + "sampling/importance_sampling_ratio/mean": 0.9998515248298645, + "sampling/importance_sampling_ratio/min": 0.5836427211761475, + "sampling/sampling_logp_difference/max": 0.538466215133667, + "sampling/sampling_logp_difference/mean": 0.01750752329826355, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 189.15625, + "completions/mean_terminated_length": 189.15625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.44771260023117065, + "epoch": 1.9963235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017847464088607986, + "kl": 0.030776165425777435, + "learning_rate": 3.0611087139920717e-07, + "loss": 0.0003, + "num_tokens": 51467808.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4113706350326538, + "sampling/importance_sampling_ratio/mean": 1.0004737377166748, + "sampling/importance_sampling_ratio/min": 0.6768690943717957, + "sampling/sampling_logp_difference/max": 0.3902773857116699, + "sampling/sampling_logp_difference/mean": 0.01701674610376358, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 186.859375, + "completions/mean_terminated_length": 186.859375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.35539400577545166, + "epoch": 1.9975490196078431, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015409080874464456, + "kl": 0.025421861559152603, + "learning_rate": 3.054544308883643e-07, + "loss": 0.0002, + "num_tokens": 51499031.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.459579348564148, + "sampling/importance_sampling_ratio/mean": 1.0004053115844727, + "sampling/importance_sampling_ratio/min": 0.6269775629043579, + "sampling/sampling_logp_difference/max": 0.4668445587158203, + "sampling/sampling_logp_difference/mean": 0.012880798429250717, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 207.546875, + "completions/mean_terminated_length": 207.546875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4121703505516052, + "epoch": 1.9987745098039216, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8926295743082454, + "kl": 0.02864101715385914, + "learning_rate": 3.0479838529270186e-07, + "loss": 0.0312, + "num_tokens": 51529402.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.9235390424728394, + "sampling/importance_sampling_ratio/mean": 1.000265121459961, + "sampling/importance_sampling_ratio/min": 0.5656784772872925, + "sampling/sampling_logp_difference/max": 0.6541666984558105, + "sampling/sampling_logp_difference/mean": 0.014927449636161327, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 206.34375, + "completions/mean_terminated_length": 206.34375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.4023796021938324, + "epoch": 2.0, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7346457549800537, + "kl": 0.03533341735601425, + "learning_rate": 3.0414273594395103e-07, + "loss": 0.0126, + "num_tokens": 51561392.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4619437456130981, + "sampling/importance_sampling_ratio/mean": 1.0000967979431152, + "sampling/importance_sampling_ratio/min": 0.779944658279419, + "sampling/sampling_logp_difference/max": 0.37976694107055664, + "sampling/sampling_logp_difference/mean": 0.014451739378273487, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 226.828125, + "completions/mean_terminated_length": 226.828125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.4342731237411499, + "epoch": 2.0012254901960786, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7353459988915481, + "kl": 0.03901055082678795, + "learning_rate": 3.034874841730382e-07, + "loss": -0.0565, + "num_tokens": 51597749.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.47286856174469, + "sampling/importance_sampling_ratio/mean": 0.9998716711997986, + "sampling/importance_sampling_ratio/min": 0.6752111911773682, + "sampling/sampling_logp_difference/max": 0.3927297592163086, + "sampling/sampling_logp_difference/mean": 0.014761561527848244, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 183.078125, + "completions/mean_terminated_length": 183.078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.48680412769317627, + "epoch": 2.002450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1911633868036358, + "kl": 0.07100534439086914, + "learning_rate": 3.0283263131008307e-07, + "loss": -0.0056, + "num_tokens": 51627562.0, + "reward": 0.65625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4876328706741333, + "sampling/importance_sampling_ratio/mean": 0.9998986124992371, + "sampling/importance_sampling_ratio/min": 0.192392960190773, + "sampling/sampling_logp_difference/max": 1.6482152938842773, + "sampling/sampling_logp_difference/mean": 0.01730552315711975, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 187.890625, + "completions/mean_terminated_length": 187.890625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.47430655360221863, + "epoch": 2.0036764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021617528998024542, + "kl": 0.040886953473091125, + "learning_rate": 3.0217817868439545e-07, + "loss": 0.0004, + "num_tokens": 51654211.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4650894403457642, + "sampling/importance_sampling_ratio/mean": 1.0000306367874146, + "sampling/importance_sampling_ratio/min": 0.7136863470077515, + "sampling/sampling_logp_difference/max": 0.3819162845611572, + "sampling/sampling_logp_difference/mean": 0.01563193090260029, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 183.796875, + "completions/mean_terminated_length": 183.796875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.37739595770835876, + "epoch": 2.0049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01592482339486375, + "kl": 0.028131265193223953, + "learning_rate": 3.015241276244729e-07, + "loss": 0.0003, + "num_tokens": 51683190.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.44826340675354, + "sampling/importance_sampling_ratio/mean": 0.9998368620872498, + "sampling/importance_sampling_ratio/min": 0.6941367387771606, + "sampling/sampling_logp_difference/max": 0.3703651428222656, + "sampling/sampling_logp_difference/mean": 0.01402178406715393, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 213.1875, + "completions/mean_terminated_length": 213.1875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.34953129291534424, + "epoch": 2.0061274509803924, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01891633922524575, + "kl": 0.02821243926882744, + "learning_rate": 3.0087047945799724e-07, + "loss": 0.0003, + "num_tokens": 51711378.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3256299495697021, + "sampling/importance_sampling_ratio/mean": 0.9995253682136536, + "sampling/importance_sampling_ratio/min": 0.6327881217002869, + "sampling/sampling_logp_difference/max": 0.45761966705322266, + "sampling/sampling_logp_difference/mean": 0.014304942451417446, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 218.6875, + "completions/mean_terminated_length": 218.6875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.37606731057167053, + "epoch": 2.0073529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2220779175234928, + "kl": 0.0383443757891655, + "learning_rate": 3.002172355118331e-07, + "loss": 0.0208, + "num_tokens": 51745150.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6290236711502075, + "sampling/importance_sampling_ratio/mean": 0.9998230934143066, + "sampling/importance_sampling_ratio/min": 0.6105685234069824, + "sampling/sampling_logp_difference/max": 0.4933648109436035, + "sampling/sampling_logp_difference/mean": 0.014038465917110443, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 225.203125, + "completions/mean_terminated_length": 225.203125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.5388938188552856, + "epoch": 2.008578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0824000288761453, + "kl": 0.05878060311079025, + "learning_rate": 2.995643971120243e-07, + "loss": -0.011, + "num_tokens": 51778683.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5637226104736328, + "sampling/importance_sampling_ratio/mean": 1.0000779628753662, + "sampling/importance_sampling_ratio/min": 0.3701205253601074, + "sampling/sampling_logp_difference/max": 0.9939265847206116, + "sampling/sampling_logp_difference/mean": 0.01762264594435692, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 176.21875, + "completions/mean_terminated_length": 176.21875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.40344706177711487, + "epoch": 2.0098039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9937570432346224, + "kl": 0.04136580601334572, + "learning_rate": 2.9891196558379126e-07, + "loss": 0.0262, + "num_tokens": 51807257.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4272184371948242, + "sampling/importance_sampling_ratio/mean": 1.000483751296997, + "sampling/importance_sampling_ratio/min": 0.6946358680725098, + "sampling/sampling_logp_difference/max": 0.3643674850463867, + "sampling/sampling_logp_difference/mean": 0.01589183695614338, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 191.734375, + "completions/mean_terminated_length": 191.734375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3914809823036194, + "epoch": 2.011029411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018623118788042722, + "kl": 0.030127104371786118, + "learning_rate": 2.9825994225152884e-07, + "loss": 0.0003, + "num_tokens": 51835816.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.651085376739502, + "sampling/importance_sampling_ratio/mean": 0.9999833703041077, + "sampling/importance_sampling_ratio/min": 0.617863655090332, + "sampling/sampling_logp_difference/max": 0.5014328956604004, + "sampling/sampling_logp_difference/mean": 0.014743650332093239, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 224.09375, + "completions/mean_terminated_length": 224.09375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4045575261116028, + "epoch": 2.0122549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01935921996649146, + "kl": 0.04010814055800438, + "learning_rate": 2.976083284388031e-07, + "loss": 0.0004, + "num_tokens": 51868526.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7385188341140747, + "sampling/importance_sampling_ratio/mean": 0.9998279213905334, + "sampling/importance_sampling_ratio/min": 0.6948853731155396, + "sampling/sampling_logp_difference/max": 0.5530334711074829, + "sampling/sampling_logp_difference/mean": 0.015155438333749771, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 268.265625, + "completions/mean_terminated_length": 268.265625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.4279055595397949, + "epoch": 2.013480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7053391349246472, + "kl": 0.039207860827445984, + "learning_rate": 2.9695712546834885e-07, + "loss": -0.0033, + "num_tokens": 51913343.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.371711254119873, + "sampling/importance_sampling_ratio/mean": 0.9999764561653137, + "sampling/importance_sampling_ratio/min": 0.6298378109931946, + "sampling/sampling_logp_difference/max": 0.4622929096221924, + "sampling/sampling_logp_difference/mean": 0.01395304873585701, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 224.65625, + "completions/mean_terminated_length": 224.65625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3981042206287384, + "epoch": 2.014705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01815549718527519, + "kl": 0.029402870684862137, + "learning_rate": 2.9630633466206655e-07, + "loss": 0.0003, + "num_tokens": 51948761.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3368773460388184, + "sampling/importance_sampling_ratio/mean": 1.0000650882720947, + "sampling/importance_sampling_ratio/min": 0.6909277439117432, + "sampling/sampling_logp_difference/max": 0.3697199821472168, + "sampling/sampling_logp_difference/mean": 0.01399196032434702, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 190.328125, + "completions/mean_terminated_length": 190.328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.44557228684425354, + "epoch": 2.0159313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9642542086273165, + "kl": 0.032532282173633575, + "learning_rate": 2.9565595734102043e-07, + "loss": -0.0098, + "num_tokens": 51978094.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.607919692993164, + "sampling/importance_sampling_ratio/mean": 1.0002827644348145, + "sampling/importance_sampling_ratio/min": 0.6513573527336121, + "sampling/sampling_logp_difference/max": 0.4749412536621094, + "sampling/sampling_logp_difference/mean": 0.015849046409130096, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 184.046875, + "completions/mean_terminated_length": 184.046875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3923611044883728, + "epoch": 2.017156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1044367314621921, + "kl": 0.03323255106806755, + "learning_rate": 2.950059948254355e-07, + "loss": -0.0253, + "num_tokens": 52007409.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4352164268493652, + "sampling/importance_sampling_ratio/mean": 1.0003793239593506, + "sampling/importance_sampling_ratio/min": 0.6964589953422546, + "sampling/sampling_logp_difference/max": 0.36174631118774414, + "sampling/sampling_logp_difference/mean": 0.014819534495472908, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 243.40625, + "completions/mean_terminated_length": 243.40625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.37322890758514404, + "epoch": 2.0183823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016663914336851788, + "kl": 0.025287559255957603, + "learning_rate": 2.943564484346943e-07, + "loss": 0.0002, + "num_tokens": 52043099.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003478527069092, + "sampling/importance_sampling_ratio/min": 0.11109285801649094, + "sampling/sampling_logp_difference/max": 2.1973888874053955, + "sampling/sampling_logp_difference/mean": 0.014352137222886086, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 200.421875, + "completions/mean_terminated_length": 200.421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.44276976585388184, + "epoch": 2.019607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9201914178971083, + "kl": 0.036136072129011154, + "learning_rate": 2.937073194873348e-07, + "loss": 0.0091, + "num_tokens": 52074758.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5517765283584595, + "sampling/importance_sampling_ratio/mean": 1.0003318786621094, + "sampling/importance_sampling_ratio/min": 0.6313734650611877, + "sampling/sampling_logp_difference/max": 0.459857702255249, + "sampling/sampling_logp_difference/mean": 0.016168639063835144, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 265.453125, + "completions/mean_terminated_length": 265.453125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.55025714635849, + "epoch": 2.0208333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9591424855913173, + "kl": 0.056286685168743134, + "learning_rate": 2.930586093010477e-07, + "loss": 0.0432, + "num_tokens": 52107523.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.531273603439331, + "sampling/importance_sampling_ratio/mean": 0.9996803998947144, + "sampling/importance_sampling_ratio/min": 0.6929144859313965, + "sampling/sampling_logp_difference/max": 0.4260997772216797, + "sampling/sampling_logp_difference/mean": 0.016547497361898422, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 193.15625, + "completions/mean_terminated_length": 193.15625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.43626347184181213, + "epoch": 2.0220588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05836585662966822, + "kl": 0.05067119002342224, + "learning_rate": 2.9241031919267363e-07, + "loss": 0.0005, + "num_tokens": 52133965.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4176301956176758, + "sampling/importance_sampling_ratio/mean": 0.9999895095825195, + "sampling/importance_sampling_ratio/min": 0.2822890281677246, + "sampling/sampling_logp_difference/max": 1.2648237943649292, + "sampling/sampling_logp_difference/mean": 0.01583799347281456, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 275.4375, + "completions/mean_terminated_length": 275.4375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.39282792806625366, + "epoch": 2.0232843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6110182406368218, + "kl": 0.02741451933979988, + "learning_rate": 2.917624504782006e-07, + "loss": 0.0202, + "num_tokens": 52176137.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998013377189636, + "sampling/importance_sampling_ratio/min": 0.6771509051322937, + "sampling/sampling_logp_difference/max": 1.1542718410491943, + "sampling/sampling_logp_difference/mean": 0.013893187046051025, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 200.640625, + "completions/mean_terminated_length": 200.640625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.390971302986145, + "epoch": 2.0245098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029704465085714463, + "kl": 0.03557724878191948, + "learning_rate": 2.911150044727605e-07, + "loss": 0.0003, + "num_tokens": 52212066.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5323922634124756, + "sampling/importance_sampling_ratio/mean": 1.0003458261489868, + "sampling/importance_sampling_ratio/min": 0.6090294122695923, + "sampling/sampling_logp_difference/max": 0.49588871002197266, + "sampling/sampling_logp_difference/mean": 0.015440979041159153, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 184.359375, + "completions/mean_terminated_length": 184.359375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.44655194878578186, + "epoch": 2.025735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021481327382425034, + "kl": 0.03439909219741821, + "learning_rate": 2.9046798249062824e-07, + "loss": 0.0003, + "num_tokens": 52244777.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6057103872299194, + "sampling/importance_sampling_ratio/mean": 1.0004479885101318, + "sampling/importance_sampling_ratio/min": 0.7303872108459473, + "sampling/sampling_logp_difference/max": 0.47356629371643066, + "sampling/sampling_logp_difference/mean": 0.01628941111266613, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 260.484375, + "completions/mean_terminated_length": 260.484375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.43018054962158203, + "epoch": 2.0269607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6781226854264821, + "kl": 0.03034813329577446, + "learning_rate": 2.898213858452173e-07, + "loss": -0.0181, + "num_tokens": 52280520.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4397597312927246, + "sampling/importance_sampling_ratio/mean": 0.9999618530273438, + "sampling/importance_sampling_ratio/min": 0.6438822746276855, + "sampling/sampling_logp_difference/max": 0.44023942947387695, + "sampling/sampling_logp_difference/mean": 0.014018382877111435, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 185.390625, + "completions/mean_terminated_length": 185.390625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3812934458255768, + "epoch": 2.028186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020154181701656497, + "kl": 0.034681811928749084, + "learning_rate": 2.891752158490778e-07, + "loss": 0.0003, + "num_tokens": 52309889.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6463236808776855, + "sampling/importance_sampling_ratio/mean": 0.9999797344207764, + "sampling/importance_sampling_ratio/min": 0.6622359156608582, + "sampling/sampling_logp_difference/max": 0.49854469299316406, + "sampling/sampling_logp_difference/mean": 0.013666651211678982, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 296.59375, + "completions/mean_terminated_length": 296.59375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.4506591260433197, + "epoch": 2.0294117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9232126386733178, + "kl": 0.04330100864171982, + "learning_rate": 2.8852947381389405e-07, + "loss": 0.0557, + "num_tokens": 52348679.0, + "reward": 0.6875, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.952916145324707, + "sampling/importance_sampling_ratio/mean": 1.0001850128173828, + "sampling/importance_sampling_ratio/min": 0.6680294275283813, + "sampling/sampling_logp_difference/max": 0.6693236827850342, + "sampling/sampling_logp_difference/mean": 0.013749771751463413, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 242.203125, + "completions/mean_terminated_length": 242.203125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.40169528126716614, + "epoch": 2.030637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.43990065422475777, + "kl": 0.033652957528829575, + "learning_rate": 2.8788416105048117e-07, + "loss": -0.0256, + "num_tokens": 52386756.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3932278156280518, + "sampling/importance_sampling_ratio/mean": 0.999697744846344, + "sampling/importance_sampling_ratio/min": 0.6413841843605042, + "sampling/sampling_logp_difference/max": 0.44412660598754883, + "sampling/sampling_logp_difference/mean": 0.013752281665802002, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 289.3125, + "completions/mean_terminated_length": 289.3125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.40133070945739746, + "epoch": 2.031862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017636936556527004, + "kl": 0.02483372949063778, + "learning_rate": 2.8723927886878396e-07, + "loss": 0.0002, + "num_tokens": 52424344.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3494555950164795, + "sampling/importance_sampling_ratio/mean": 1.0002000331878662, + "sampling/importance_sampling_ratio/min": 0.6392406225204468, + "sampling/sampling_logp_difference/max": 0.4474743604660034, + "sampling/sampling_logp_difference/mean": 0.014143183827400208, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 162.421875, + "completions/mean_terminated_length": 162.421875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.32380715012550354, + "epoch": 2.0330882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022417009948019623, + "kl": 0.03482438996434212, + "learning_rate": 2.865948285778713e-07, + "loss": 0.0003, + "num_tokens": 52446899.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3108482360839844, + "sampling/importance_sampling_ratio/mean": 0.9990267157554626, + "sampling/importance_sampling_ratio/min": 0.6505879759788513, + "sampling/sampling_logp_difference/max": 0.42987871170043945, + "sampling/sampling_logp_difference/mean": 0.013643001206219196, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 208.890625, + "completions/mean_terminated_length": 208.890625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3969336152076721, + "epoch": 2.034313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017978950074423048, + "kl": 0.028416968882083893, + "learning_rate": 2.8595081148593737e-07, + "loss": 0.0003, + "num_tokens": 52478204.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3293585777282715, + "sampling/importance_sampling_ratio/mean": 1.0005228519439697, + "sampling/importance_sampling_ratio/min": 0.7291538715362549, + "sampling/sampling_logp_difference/max": 0.3158705234527588, + "sampling/sampling_logp_difference/mean": 0.013309773057699203, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 183.515625, + "completions/mean_terminated_length": 183.515625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.4349133372306824, + "epoch": 2.0355392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8251669417316957, + "kl": 0.061835259199142456, + "learning_rate": 2.8530722890029534e-07, + "loss": 0.0128, + "num_tokens": 52506173.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.423336148262024, + "sampling/importance_sampling_ratio/mean": 0.9992690086364746, + "sampling/importance_sampling_ratio/min": 0.6807805895805359, + "sampling/sampling_logp_difference/max": 0.38451528549194336, + "sampling/sampling_logp_difference/mean": 0.015650738030672073, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 212.671875, + "completions/mean_terminated_length": 212.671875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.4570426046848297, + "epoch": 2.036764705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1227828641953626, + "kl": 0.074189193546772, + "learning_rate": 2.8466408212737776e-07, + "loss": 0.0187, + "num_tokens": 52535128.0, + "reward": 0.0625, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.629642128944397, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.44578036665916443, + "sampling/sampling_logp_difference/max": 0.8079289197921753, + "sampling/sampling_logp_difference/mean": 0.01558186300098896, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 247.625, + "completions/mean_terminated_length": 247.625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.4032864570617676, + "epoch": 2.0379901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8918525831586327, + "kl": 0.03512345626950264, + "learning_rate": 2.840213724727315e-07, + "loss": 0.0653, + "num_tokens": 52567776.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.0001716613769531, + "sampling/importance_sampling_ratio/min": 0.6171543002128601, + "sampling/sampling_logp_difference/max": 0.4826362133026123, + "sampling/sampling_logp_difference/mean": 0.014290999621152878, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 229.078125, + "completions/mean_terminated_length": 229.078125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.39683741331100464, + "epoch": 2.0392156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018637937810432412, + "kl": 0.02914087474346161, + "learning_rate": 2.8337910124101625e-07, + "loss": 0.0003, + "num_tokens": 52598501.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.484952688217163, + "sampling/importance_sampling_ratio/mean": 1.0001593828201294, + "sampling/importance_sampling_ratio/min": 0.6945454478263855, + "sampling/sampling_logp_difference/max": 0.3953828811645508, + "sampling/sampling_logp_difference/mean": 0.01495380885899067, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 241.359375, + "completions/mean_terminated_length": 241.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5107121467590332, + "epoch": 2.0404411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7397664210671164, + "kl": 0.1048065572977066, + "learning_rate": 2.8273726973600254e-07, + "loss": -0.0008, + "num_tokens": 52634892.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4599852561950684, + "sampling/importance_sampling_ratio/mean": 1.0000249147415161, + "sampling/importance_sampling_ratio/min": 0.6448967456817627, + "sampling/sampling_logp_difference/max": 0.4386650323867798, + "sampling/sampling_logp_difference/mean": 0.017005791887640953, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 214.640625, + "completions/mean_terminated_length": 214.640625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.44128990173339844, + "epoch": 2.0416666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019255397438440618, + "kl": 0.029830558225512505, + "learning_rate": 2.8209587926056687e-07, + "loss": 0.0003, + "num_tokens": 52669157.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5841492414474487, + "sampling/importance_sampling_ratio/mean": 1.0002622604370117, + "sampling/importance_sampling_ratio/min": 0.7802820205688477, + "sampling/sampling_logp_difference/max": 0.46004748344421387, + "sampling/sampling_logp_difference/mean": 0.015293768607079983, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 212.28125, + "completions/mean_terminated_length": 212.28125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4199128746986389, + "epoch": 2.042892156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7731108336925153, + "kl": 0.0362180694937706, + "learning_rate": 2.8145493111669183e-07, + "loss": -0.0057, + "num_tokens": 52698519.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8718528747558594, + "sampling/importance_sampling_ratio/mean": 0.9999653100967407, + "sampling/importance_sampling_ratio/min": 0.614104151725769, + "sampling/sampling_logp_difference/max": 0.6269288063049316, + "sampling/sampling_logp_difference/mean": 0.015863358974456787, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 245.078125, + "completions/mean_terminated_length": 245.078125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.3867439031600952, + "epoch": 2.0441176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01829240269316071, + "kl": 0.036024175584316254, + "learning_rate": 2.808144266054612e-07, + "loss": 0.0004, + "num_tokens": 52734428.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4562289714813232, + "sampling/importance_sampling_ratio/mean": 1.0001846551895142, + "sampling/importance_sampling_ratio/min": 0.6771669387817383, + "sampling/sampling_logp_difference/max": 0.389837384223938, + "sampling/sampling_logp_difference/mean": 0.012600721791386604, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 212.84375, + "completions/mean_terminated_length": 212.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.42237645387649536, + "epoch": 2.045343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0194288976480008, + "kl": 0.060419388115406036, + "learning_rate": 2.80174367027059e-07, + "loss": 0.0089, + "num_tokens": 52762674.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3720364570617676, + "sampling/importance_sampling_ratio/mean": 1.0000308752059937, + "sampling/importance_sampling_ratio/min": 0.7257882952690125, + "sampling/sampling_logp_difference/max": 0.32049691677093506, + "sampling/sampling_logp_difference/mean": 0.014276196248829365, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 211.734375, + "completions/mean_terminated_length": 211.734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.44351261854171753, + "epoch": 2.0465686274509802, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0851909857559852, + "kl": 0.059834472835063934, + "learning_rate": 2.795347536807653e-07, + "loss": -0.0173, + "num_tokens": 52790353.0, + "reward": 0.71875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997990131378174, + "sampling/importance_sampling_ratio/min": 0.6218530535697937, + "sampling/sampling_logp_difference/max": 0.6934218406677246, + "sampling/sampling_logp_difference/mean": 0.015442324802279472, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 235.34375, + "completions/mean_terminated_length": 235.34375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.491445392370224, + "epoch": 2.047794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.912618613009257, + "kl": 0.029086031019687653, + "learning_rate": 2.7889558786495455e-07, + "loss": 0.063, + "num_tokens": 52821719.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3871740102767944, + "sampling/importance_sampling_ratio/mean": 0.9998846054077148, + "sampling/importance_sampling_ratio/min": 0.6204485893249512, + "sampling/sampling_logp_difference/max": 0.4773125648498535, + "sampling/sampling_logp_difference/mean": 0.01591584086418152, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 219.875, + "completions/mean_terminated_length": 219.875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.31475579738616943, + "epoch": 2.049019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05779936415975806, + "kl": 0.03842802345752716, + "learning_rate": 2.782568708770933e-07, + "loss": 0.0004, + "num_tokens": 52853599.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.294599175453186, + "sampling/importance_sampling_ratio/mean": 1.0002093315124512, + "sampling/importance_sampling_ratio/min": 0.6579135060310364, + "sampling/sampling_logp_difference/max": 0.4186818599700928, + "sampling/sampling_logp_difference/mean": 0.012618561275303364, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 220.5, + "completions/mean_terminated_length": 220.5, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3651142716407776, + "epoch": 2.0502450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021175015253094443, + "kl": 0.03167567402124405, + "learning_rate": 2.7761860401373627e-07, + "loss": 0.0003, + "num_tokens": 52885103.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6355892419815063, + "sampling/importance_sampling_ratio/mean": 0.9996398687362671, + "sampling/importance_sampling_ratio/min": 0.6303655505180359, + "sampling/sampling_logp_difference/max": 0.4920032024383545, + "sampling/sampling_logp_difference/mean": 0.013990361243486404, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 240.796875, + "completions/mean_terminated_length": 240.796875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.37054872512817383, + "epoch": 2.051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6507139850082363, + "kl": 0.02488752454519272, + "learning_rate": 2.7698078857052474e-07, + "loss": -0.0175, + "num_tokens": 52914850.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6448520421981812, + "sampling/importance_sampling_ratio/mean": 0.9998887181282043, + "sampling/importance_sampling_ratio/min": 0.5966106653213501, + "sampling/sampling_logp_difference/max": 0.5164905786514282, + "sampling/sampling_logp_difference/mean": 0.013555832207202911, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 243.90625, + "completions/mean_terminated_length": 243.90625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.40336084365844727, + "epoch": 2.0526960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7497345971673042, + "kl": 0.039283160120248795, + "learning_rate": 2.763434258421836e-07, + "loss": -0.0003, + "num_tokens": 52948236.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.546738624572754, + "sampling/importance_sampling_ratio/mean": 1.0000205039978027, + "sampling/importance_sampling_ratio/min": 0.606302797794342, + "sampling/sampling_logp_difference/max": 0.5003757476806641, + "sampling/sampling_logp_difference/mean": 0.01379475649446249, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 225.25, + "completions/mean_terminated_length": 225.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3703897297382355, + "epoch": 2.053921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016045510247310706, + "kl": 0.025694692507386208, + "learning_rate": 2.757065171225192e-07, + "loss": 0.0002, + "num_tokens": 52977532.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4028879404067993, + "sampling/importance_sampling_ratio/mean": 1.000049114227295, + "sampling/importance_sampling_ratio/min": 0.6918835639953613, + "sampling/sampling_logp_difference/max": 0.36833763122558594, + "sampling/sampling_logp_difference/mean": 0.01374561432749033, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 207.421875, + "completions/mean_terminated_length": 207.421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.442902147769928, + "epoch": 2.0551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6825921709536202, + "kl": 0.05967596918344498, + "learning_rate": 2.750700637044155e-07, + "loss": 0.0041, + "num_tokens": 53007111.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4362882375717163, + "sampling/importance_sampling_ratio/mean": 1.0001070499420166, + "sampling/importance_sampling_ratio/min": 0.7374628186225891, + "sampling/sampling_logp_difference/max": 0.36206209659576416, + "sampling/sampling_logp_difference/mean": 0.013813722878694534, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 276.640625, + "completions/mean_terminated_length": 276.640625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.46242693066596985, + "epoch": 2.0563725490196076, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.819021851634093, + "kl": 0.04801500216126442, + "learning_rate": 2.7443406687983264e-07, + "loss": 0.0126, + "num_tokens": 53045504.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 0.9997797608375549, + "sampling/importance_sampling_ratio/min": 0.6948862671852112, + "sampling/sampling_logp_difference/max": 0.49199533462524414, + "sampling/sampling_logp_difference/mean": 0.014683485962450504, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 219.515625, + "completions/mean_terminated_length": 219.515625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4543115794658661, + "epoch": 2.0575980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027611047739936837, + "kl": 0.048169635236263275, + "learning_rate": 2.7379852793980416e-07, + "loss": 0.0005, + "num_tokens": 53076625.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4253244400024414, + "sampling/importance_sampling_ratio/mean": 1.000441074371338, + "sampling/importance_sampling_ratio/min": 0.6624775528907776, + "sampling/sampling_logp_difference/max": 0.41176867485046387, + "sampling/sampling_logp_difference/mean": 0.014917616732418537, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 278.03125, + "completions/mean_terminated_length": 278.03125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.5146213173866272, + "epoch": 2.0588235294117645, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8918473278469441, + "kl": 0.042962804436683655, + "learning_rate": 2.7316344817443363e-07, + "loss": -0.0202, + "num_tokens": 53113059.0, + "reward": 0.125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.478275179862976, + "sampling/importance_sampling_ratio/mean": 1.000291109085083, + "sampling/importance_sampling_ratio/min": 0.6280068159103394, + "sampling/sampling_logp_difference/max": 0.46520423889160156, + "sampling/sampling_logp_difference/mean": 0.015553226694464684, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 223.0, + "completions/mean_terminated_length": 223.0, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.42054322361946106, + "epoch": 2.060049019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.000222056642928, + "kl": 0.07231605052947998, + "learning_rate": 2.7252882887289287e-07, + "loss": -0.0323, + "num_tokens": 53143299.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4710259437561035, + "sampling/importance_sampling_ratio/mean": 0.999838650226593, + "sampling/importance_sampling_ratio/min": 0.6984808444976807, + "sampling/sampling_logp_difference/max": 0.38596010208129883, + "sampling/sampling_logp_difference/mean": 0.013918423093855381, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 174.3125, + "completions/mean_terminated_length": 174.3125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.40502676367759705, + "epoch": 2.0612745098039214, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02565488586677233, + "kl": 0.03213399276137352, + "learning_rate": 2.718946713234185e-07, + "loss": 0.0003, + "num_tokens": 53169831.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.590608835220337, + "sampling/importance_sampling_ratio/mean": 1.000605583190918, + "sampling/importance_sampling_ratio/min": 0.36815980076789856, + "sampling/sampling_logp_difference/max": 0.9992382526397705, + "sampling/sampling_logp_difference/mean": 0.015341555699706078, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 238.796875, + "completions/mean_terminated_length": 238.796875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.47911858558654785, + "epoch": 2.0625, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0922856363247173, + "kl": 0.0631026178598404, + "learning_rate": 2.712609768133106e-07, + "loss": -0.0219, + "num_tokens": 53209146.0, + "reward": 0.40625, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.393763780593872, + "sampling/importance_sampling_ratio/mean": 1.000108003616333, + "sampling/importance_sampling_ratio/min": 0.7464548945426941, + "sampling/sampling_logp_difference/max": 0.33200788497924805, + "sampling/sampling_logp_difference/mean": 0.015322668477892876, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 239.5, + "completions/mean_terminated_length": 239.5, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.4741671085357666, + "epoch": 2.063725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7921750118960504, + "kl": 0.027647310867905617, + "learning_rate": 2.7062774662892886e-07, + "loss": -0.0144, + "num_tokens": 53245818.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5185213088989258, + "sampling/importance_sampling_ratio/mean": 1.0000309944152832, + "sampling/importance_sampling_ratio/min": 0.7082099914550781, + "sampling/sampling_logp_difference/max": 0.4177370071411133, + "sampling/sampling_logp_difference/mean": 0.015398973599076271, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 252.234375, + "completions/mean_terminated_length": 252.234375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.4754703640937805, + "epoch": 2.064950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5951646185760013, + "kl": 0.050365347415208817, + "learning_rate": 2.6999498205569e-07, + "loss": 0.0316, + "num_tokens": 53279993.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.433864951133728, + "sampling/importance_sampling_ratio/mean": 1.0001448392868042, + "sampling/importance_sampling_ratio/min": 0.6685059666633606, + "sampling/sampling_logp_difference/max": 0.4027099609375, + "sampling/sampling_logp_difference/mean": 0.015721773728728294, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.4598179757595062, + "epoch": 2.0661764705882355, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9812591949291634, + "kl": 0.08628267049789429, + "learning_rate": 2.693626843780665e-07, + "loss": -0.0056, + "num_tokens": 53310809.0, + "reward": 0.09375, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5081480741500854, + "sampling/importance_sampling_ratio/mean": 0.9999203681945801, + "sampling/importance_sampling_ratio/min": 0.7070103287696838, + "sampling/sampling_logp_difference/max": 0.41088247299194336, + "sampling/sampling_logp_difference/mean": 0.015428414568305016, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 226.40625, + "completions/mean_terminated_length": 226.40625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.392291396856308, + "epoch": 2.0674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016348212507894044, + "kl": 0.03218259662389755, + "learning_rate": 2.687308548795825e-07, + "loss": 0.0003, + "num_tokens": 53344883.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4507747888565063, + "sampling/importance_sampling_ratio/mean": 1.0000643730163574, + "sampling/importance_sampling_ratio/min": 0.6488705277442932, + "sampling/sampling_logp_difference/max": 0.4325220584869385, + "sampling/sampling_logp_difference/mean": 0.014131030067801476, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 241.296875, + "completions/mean_terminated_length": 241.296875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.47790658473968506, + "epoch": 2.0686274509803924, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2041471119046432, + "kl": 0.06056524068117142, + "learning_rate": 2.6809949484281164e-07, + "loss": -0.0086, + "num_tokens": 53386726.0, + "reward": 0.0, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5972397327423096, + "sampling/importance_sampling_ratio/mean": 0.9999131560325623, + "sampling/importance_sampling_ratio/min": 0.5408050417900085, + "sampling/sampling_logp_difference/max": 0.6146965026855469, + "sampling/sampling_logp_difference/mean": 0.016169361770153046, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 238.328125, + "completions/mean_terminated_length": 238.328125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.4980003535747528, + "epoch": 2.0698529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1714652987900576, + "kl": 0.05430366098880768, + "learning_rate": 2.674686055493748e-07, + "loss": 0.0123, + "num_tokens": 53419739.0, + "reward": 0.3125, + "reward_std": 0.42898139357566833, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.2911043167114258, + "sampling/importance_sampling_ratio/mean": 1.0000584125518799, + "sampling/importance_sampling_ratio/min": 0.7557491660118103, + "sampling/sampling_logp_difference/max": 0.2800458073616028, + "sampling/sampling_logp_difference/mean": 0.016370002180337906, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 281.78125, + "completions/mean_terminated_length": 281.78125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.49693894386291504, + "epoch": 2.071078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5669572112070449, + "kl": 0.03916829079389572, + "learning_rate": 2.668381882799375e-07, + "loss": 0.0005, + "num_tokens": 53455693.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.3657170534133911, + "sampling/importance_sampling_ratio/mean": 0.9997628927230835, + "sampling/importance_sampling_ratio/min": 0.6173994541168213, + "sampling/sampling_logp_difference/max": 0.4822390079498291, + "sampling/sampling_logp_difference/mean": 0.015329066663980484, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 213.1875, + "completions/mean_terminated_length": 213.1875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.38387399911880493, + "epoch": 2.0723039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018357899831684923, + "kl": 0.028213728219270706, + "learning_rate": 2.662082443142068e-07, + "loss": 0.0003, + "num_tokens": 53485609.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5345168113708496, + "sampling/importance_sampling_ratio/mean": 1.0005815029144287, + "sampling/importance_sampling_ratio/min": 0.6108310222625732, + "sampling/sampling_logp_difference/max": 0.4929349422454834, + "sampling/sampling_logp_difference/mean": 0.01430382952094078, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 240.0625, + "completions/mean_terminated_length": 240.0625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.36599016189575195, + "epoch": 2.073529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025775944282998488, + "kl": 0.03330044820904732, + "learning_rate": 2.6557877493092883e-07, + "loss": 0.0003, + "num_tokens": 53518045.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996206760406494, + "sampling/importance_sampling_ratio/min": 0.3689557909965515, + "sampling/sampling_logp_difference/max": 0.9970784187316895, + "sampling/sampling_logp_difference/mean": 0.013810910284519196, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 239.234375, + "completions/mean_terminated_length": 239.234375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.3981662392616272, + "epoch": 2.0747549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7185263627841968, + "kl": 0.04459173604846001, + "learning_rate": 2.6494978140788686e-07, + "loss": 0.0165, + "num_tokens": 53549660.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3532410860061646, + "sampling/importance_sampling_ratio/mean": 1.0003100633621216, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.014453582465648651, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 231.546875, + "completions/mean_terminated_length": 231.546875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4637501835823059, + "epoch": 2.075980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5817636462228674, + "kl": 0.0697750449180603, + "learning_rate": 2.643212650218976e-07, + "loss": 0.0174, + "num_tokens": 53583007.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5472601652145386, + "sampling/importance_sampling_ratio/mean": 0.9998758435249329, + "sampling/importance_sampling_ratio/min": 0.6351385712623596, + "sampling/sampling_logp_difference/max": 0.45391201972961426, + "sampling/sampling_logp_difference/mean": 0.01557945366948843, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 245.734375, + "completions/mean_terminated_length": 245.734375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4809524714946747, + "epoch": 2.077205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9827785276513454, + "kl": 0.033042408525943756, + "learning_rate": 2.6369322704881e-07, + "loss": 0.0039, + "num_tokens": 53620206.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.910003423690796, + "sampling/importance_sampling_ratio/mean": 1.0003118515014648, + "sampling/importance_sampling_ratio/min": 0.6336769461631775, + "sampling/sampling_logp_difference/max": 0.6471049785614014, + "sampling/sampling_logp_difference/mean": 0.016739681363105774, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 277.6875, + "completions/mean_terminated_length": 277.6875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3514638841152191, + "epoch": 2.0784313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5419177648619975, + "kl": 0.028552263975143433, + "learning_rate": 2.6306566876350067e-07, + "loss": 0.0003, + "num_tokens": 53663482.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.546760082244873, + "sampling/importance_sampling_ratio/mean": 1.0000574588775635, + "sampling/importance_sampling_ratio/min": 0.683687150478363, + "sampling/sampling_logp_difference/max": 0.43616247177124023, + "sampling/sampling_logp_difference/mean": 0.012605156749486923, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 190.9375, + "completions/mean_terminated_length": 190.9375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3160659670829773, + "epoch": 2.079656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020635832829643035, + "kl": 0.0393621064722538, + "learning_rate": 2.6243859143987367e-07, + "loss": 0.0003, + "num_tokens": 53688582.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4020789861679077, + "sampling/importance_sampling_ratio/mean": 1.0002716779708862, + "sampling/importance_sampling_ratio/min": 0.7184279561042786, + "sampling/sampling_logp_difference/max": 0.3379560708999634, + "sampling/sampling_logp_difference/mean": 0.01307186484336853, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 241.546875, + "completions/mean_terminated_length": 241.546875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.4061009883880615, + "epoch": 2.0808823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030250847742007314, + "kl": 0.05404685065150261, + "learning_rate": 2.6181199635085616e-07, + "loss": 0.0006, + "num_tokens": 53718937.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4975636005401611, + "sampling/importance_sampling_ratio/mean": 1.0002026557922363, + "sampling/importance_sampling_ratio/min": 0.6254689693450928, + "sampling/sampling_logp_difference/max": 0.4692535400390625, + "sampling/sampling_logp_difference/mean": 0.014528755098581314, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 272.140625, + "completions/mean_terminated_length": 272.140625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.41196760535240173, + "epoch": 2.082107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01856909248973148, + "kl": 0.02570771798491478, + "learning_rate": 2.6118588476839607e-07, + "loss": 0.0003, + "num_tokens": 53753954.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5613126754760742, + "sampling/importance_sampling_ratio/mean": 0.999936580657959, + "sampling/importance_sampling_ratio/min": 0.6733855605125427, + "sampling/sampling_logp_difference/max": 0.44552695751190186, + "sampling/sampling_logp_difference/mean": 0.014957739971578121, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 178.28125, + "completions/mean_terminated_length": 178.28125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3256189227104187, + "epoch": 2.0833333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03437947921858951, + "kl": 0.040330491960048676, + "learning_rate": 2.6056025796346094e-07, + "loss": 0.0004, + "num_tokens": 53781940.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6014013290405273, + "sampling/importance_sampling_ratio/mean": 1.0005395412445068, + "sampling/importance_sampling_ratio/min": 0.6476975679397583, + "sampling/sampling_logp_difference/max": 0.47087907791137695, + "sampling/sampling_logp_difference/mean": 0.013887629844248295, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 217.3125, + "completions/mean_terminated_length": 217.3125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.42812711000442505, + "epoch": 2.0845588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6869708520780327, + "kl": 0.05290180817246437, + "learning_rate": 2.599351172060329e-07, + "loss": 0.0003, + "num_tokens": 53813640.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4987599849700928, + "sampling/importance_sampling_ratio/mean": 0.9994872808456421, + "sampling/importance_sampling_ratio/min": 0.5672615170478821, + "sampling/sampling_logp_difference/max": 0.5669348239898682, + "sampling/sampling_logp_difference/mean": 0.015493819490075111, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 284.328125, + "completions/mean_terminated_length": 284.328125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.4446394741535187, + "epoch": 2.0857843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7015076939413134, + "kl": 0.035604920238256454, + "learning_rate": 2.593104637651087e-07, + "loss": 0.0113, + "num_tokens": 53851597.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8148858547210693, + "sampling/importance_sampling_ratio/mean": 1.0000540018081665, + "sampling/importance_sampling_ratio/min": 0.6622360348701477, + "sampling/sampling_logp_difference/max": 0.5960226058959961, + "sampling/sampling_logp_difference/mean": 0.014582192525267601, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 219.421875, + "completions/mean_terminated_length": 219.421875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.43588775396347046, + "epoch": 2.0870098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8601369733440415, + "kl": 0.04138219356536865, + "learning_rate": 2.5868629890869463e-07, + "loss": 0.0365, + "num_tokens": 53883176.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4993034601211548, + "sampling/importance_sampling_ratio/mean": 0.999489963054657, + "sampling/importance_sampling_ratio/min": 0.6461346745491028, + "sampling/sampling_logp_difference/max": 0.4367474317550659, + "sampling/sampling_logp_difference/mean": 0.015711303800344467, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 262.3125, + "completions/mean_terminated_length": 262.3125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3219606876373291, + "epoch": 2.088235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014227525741447764, + "kl": 0.02282378077507019, + "learning_rate": 2.580626239038061e-07, + "loss": 0.0002, + "num_tokens": 53917260.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6047582626342773, + "sampling/importance_sampling_ratio/mean": 1.0003926753997803, + "sampling/importance_sampling_ratio/min": 0.67872554063797, + "sampling/sampling_logp_difference/max": 0.472973108291626, + "sampling/sampling_logp_difference/mean": 0.012733912095427513, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 267.40625, + "completions/mean_terminated_length": 267.40625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.44568419456481934, + "epoch": 2.0894607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6986790698162758, + "kl": 0.03710407763719559, + "learning_rate": 2.5743944001646387e-07, + "loss": 0.0135, + "num_tokens": 53955110.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.408916711807251, + "sampling/importance_sampling_ratio/mean": 0.9997654557228088, + "sampling/importance_sampling_ratio/min": 0.5261492133140564, + "sampling/sampling_logp_difference/max": 0.6421704292297363, + "sampling/sampling_logp_difference/mean": 0.015230206772685051, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 181.984375, + "completions/mean_terminated_length": 181.984375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.39598989486694336, + "epoch": 2.090686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023557917460605267, + "kl": 0.036407310515642166, + "learning_rate": 2.568167485116919e-07, + "loss": 0.0003, + "num_tokens": 53987349.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3268274068832397, + "sampling/importance_sampling_ratio/mean": 0.9992355704307556, + "sampling/importance_sampling_ratio/min": 0.6835634112358093, + "sampling/sampling_logp_difference/max": 0.3804359436035156, + "sampling/sampling_logp_difference/mean": 0.015313874930143356, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 194.484375, + "completions/mean_terminated_length": 194.484375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.46629300713539124, + "epoch": 2.0919117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8651152702445029, + "kl": 0.10407014191150665, + "learning_rate": 2.5619455065351435e-07, + "loss": 0.0199, + "num_tokens": 54019940.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5571781396865845, + "sampling/importance_sampling_ratio/mean": 0.9995058178901672, + "sampling/importance_sampling_ratio/min": 0.6799058318138123, + "sampling/sampling_logp_difference/max": 0.4428752660751343, + "sampling/sampling_logp_difference/mean": 0.016040362417697906, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 176.390625, + "completions/mean_terminated_length": 176.390625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2929573059082031, + "epoch": 2.093137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022889010555841464, + "kl": 0.03555411472916603, + "learning_rate": 2.555728477049532e-07, + "loss": 0.0004, + "num_tokens": 54046653.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.611952304840088, + "sampling/importance_sampling_ratio/mean": 0.9993224143981934, + "sampling/importance_sampling_ratio/min": 0.5149512887001038, + "sampling/sampling_logp_difference/max": 0.6636829376220703, + "sampling/sampling_logp_difference/mean": 0.012265045195817947, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 184.6875, + "completions/mean_terminated_length": 184.6875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3818640410900116, + "epoch": 2.094362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017828986446728298, + "kl": 0.029479539021849632, + "learning_rate": 2.5495164092802646e-07, + "loss": 0.0003, + "num_tokens": 54077881.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5072616338729858, + "sampling/importance_sampling_ratio/mean": 0.9999698400497437, + "sampling/importance_sampling_ratio/min": 0.7301764488220215, + "sampling/sampling_logp_difference/max": 0.4102945327758789, + "sampling/sampling_logp_difference/mean": 0.014310354366898537, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 222.484375, + "completions/mean_terminated_length": 222.484375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3059423565864563, + "epoch": 2.0955882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5902274709945375, + "kl": 0.02456137351691723, + "learning_rate": 2.5433093158374437e-07, + "loss": 0.013, + "num_tokens": 54109768.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5791244506835938, + "sampling/importance_sampling_ratio/mean": 0.9997508525848389, + "sampling/importance_sampling_ratio/min": 0.5916173458099365, + "sampling/sampling_logp_difference/max": 0.524895191192627, + "sampling/sampling_logp_difference/mean": 0.011942279525101185, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 258.5625, + "completions/mean_terminated_length": 258.5625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.40893906354904175, + "epoch": 2.096813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6073843168681505, + "kl": 0.028748570010066032, + "learning_rate": 2.537107209321074e-07, + "loss": -0.0074, + "num_tokens": 54146076.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4350489377975464, + "sampling/importance_sampling_ratio/mean": 0.9998239874839783, + "sampling/importance_sampling_ratio/min": 0.6944383382797241, + "sampling/sampling_logp_difference/max": 0.3646519184112549, + "sampling/sampling_logp_difference/mean": 0.014718063175678253, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 196.328125, + "completions/mean_terminated_length": 196.328125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.4684864282608032, + "epoch": 2.0980392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8973405832334115, + "kl": 0.05926259234547615, + "learning_rate": 2.5309101023210424e-07, + "loss": -0.01, + "num_tokens": 54174337.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.488796591758728, + "sampling/importance_sampling_ratio/mean": 1.0010432004928589, + "sampling/importance_sampling_ratio/min": 0.6786234378814697, + "sampling/sampling_logp_difference/max": 0.3979681730270386, + "sampling/sampling_logp_difference/mean": 0.01676376909017563, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 213.40625, + "completions/mean_terminated_length": 213.40625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3964466452598572, + "epoch": 2.099264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02150098145118524, + "kl": 0.03202517330646515, + "learning_rate": 2.524718007417081e-07, + "loss": 0.0003, + "num_tokens": 54205211.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5897349119186401, + "sampling/importance_sampling_ratio/mean": 0.99957674741745, + "sampling/importance_sampling_ratio/min": 0.6785008311271667, + "sampling/sampling_logp_difference/max": 0.46356725692749023, + "sampling/sampling_logp_difference/mean": 0.01471491064876318, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 234.0625, + "completions/mean_terminated_length": 234.0625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.4208112955093384, + "epoch": 2.1004901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8814452155258764, + "kl": 0.03885569050908089, + "learning_rate": 2.518530937178751e-07, + "loss": 0.0124, + "num_tokens": 54241231.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.000196933746338, + "sampling/importance_sampling_ratio/min": 0.6938435435295105, + "sampling/sampling_logp_difference/max": 0.4361441135406494, + "sampling/sampling_logp_difference/mean": 0.014252031221985817, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 233.90625, + "completions/mean_terminated_length": 233.90625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.4084511399269104, + "epoch": 2.1017156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025920117237784642, + "kl": 0.038195956498384476, + "learning_rate": 2.512348904165411e-07, + "loss": 0.0004, + "num_tokens": 54274217.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4753884077072144, + "sampling/importance_sampling_ratio/mean": 0.9999651908874512, + "sampling/importance_sampling_ratio/min": 0.4877993166446686, + "sampling/sampling_logp_difference/max": 0.7178511619567871, + "sampling/sampling_logp_difference/mean": 0.015018817037343979, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 168.21875, + "completions/mean_terminated_length": 168.21875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.33479487895965576, + "epoch": 2.1029411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023277676977274776, + "kl": 0.030932046473026276, + "learning_rate": 2.5061719209262e-07, + "loss": 0.0003, + "num_tokens": 54299799.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3236008882522583, + "sampling/importance_sampling_ratio/mean": 0.9999775886535645, + "sampling/importance_sampling_ratio/min": 0.6771601438522339, + "sampling/sampling_logp_difference/max": 0.3898475170135498, + "sampling/sampling_logp_difference/mean": 0.013862754218280315, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 197.546875, + "completions/mean_terminated_length": 197.546875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.4255238175392151, + "epoch": 2.1041666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022266585419841105, + "kl": 0.03658965975046158, + "learning_rate": 2.500000000000001e-07, + "loss": 0.0004, + "num_tokens": 54332122.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3816039562225342, + "sampling/importance_sampling_ratio/mean": 0.9997903108596802, + "sampling/importance_sampling_ratio/min": 0.6319694519042969, + "sampling/sampling_logp_difference/max": 0.45891427993774414, + "sampling/sampling_logp_difference/mean": 0.016192376613616943, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 162.0625, + "completions/mean_terminated_length": 162.0625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.26786911487579346, + "epoch": 2.105392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022903144719932732, + "kl": 0.027647744864225388, + "learning_rate": 2.49383315391542e-07, + "loss": 0.0003, + "num_tokens": 54356398.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4758132696151733, + "sampling/importance_sampling_ratio/mean": 0.9994162321090698, + "sampling/importance_sampling_ratio/min": 0.6632112264633179, + "sampling/sampling_logp_difference/max": 0.4106616973876953, + "sampling/sampling_logp_difference/mean": 0.012282269075512886, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 162.84375, + "completions/mean_terminated_length": 162.84375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.34371936321258545, + "epoch": 2.1066176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1992139792879184, + "kl": 0.045130655169487, + "learning_rate": 2.4876713951907685e-07, + "loss": -0.0381, + "num_tokens": 54382868.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5425312519073486, + "sampling/importance_sampling_ratio/mean": 0.9999926090240479, + "sampling/importance_sampling_ratio/min": 0.668948233127594, + "sampling/sampling_logp_difference/max": 0.433424711227417, + "sampling/sampling_logp_difference/mean": 0.014869507402181625, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 201.890625, + "completions/mean_terminated_length": 201.890625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.35909855365753174, + "epoch": 2.107843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.719253450724179, + "kl": 0.058807168155908585, + "learning_rate": 2.481514736334022e-07, + "loss": 0.0182, + "num_tokens": 54410109.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6629894971847534, + "sampling/importance_sampling_ratio/mean": 0.9998665452003479, + "sampling/importance_sampling_ratio/min": 0.6298564672470093, + "sampling/sampling_logp_difference/max": 0.5086169242858887, + "sampling/sampling_logp_difference/mean": 0.01432094443589449, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 223.65625, + "completions/mean_terminated_length": 223.65625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.44382837414741516, + "epoch": 2.1090686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042830580239373986, + "kl": 0.05109231919050217, + "learning_rate": 2.4753631898428134e-07, + "loss": 0.0005, + "num_tokens": 54444903.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4976780414581299, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.6262986660003662, + "sampling/sampling_logp_difference/max": 0.4679279327392578, + "sampling/sampling_logp_difference/mean": 0.015708569437265396, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 236.765625, + "completions/mean_terminated_length": 236.765625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.4542069137096405, + "epoch": 2.110294117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0527350717176063, + "kl": 0.038782984018325806, + "learning_rate": 2.4692167682043853e-07, + "loss": -0.043, + "num_tokens": 54490216.0, + "reward": 0.28125, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.4028143882751465, + "sampling/importance_sampling_ratio/mean": 0.9994980096817017, + "sampling/importance_sampling_ratio/min": 0.6208435297012329, + "sampling/sampling_logp_difference/max": 0.47667622566223145, + "sampling/sampling_logp_difference/mean": 0.016528688371181488, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 199.46875, + "completions/mean_terminated_length": 199.46875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.5112206339836121, + "epoch": 2.111519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6291184660248751, + "kl": 0.054802052676677704, + "learning_rate": 2.4630754838955896e-07, + "loss": -0.0041, + "num_tokens": 54519750.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5745244026184082, + "sampling/importance_sampling_ratio/mean": 0.9998692870140076, + "sampling/importance_sampling_ratio/min": 0.5896148085594177, + "sampling/sampling_logp_difference/max": 0.5282858610153198, + "sampling/sampling_logp_difference/mean": 0.017753848806023598, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 224.96875, + "completions/mean_terminated_length": 224.96875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.41554874181747437, + "epoch": 2.1127450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9198415011654026, + "kl": 0.06762229651212692, + "learning_rate": 2.456939349382843e-07, + "loss": 0.0265, + "num_tokens": 54552964.0, + "reward": 0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.3247536420822144, + "sampling/importance_sampling_ratio/mean": 1.0001225471496582, + "sampling/importance_sampling_ratio/min": 0.6661868691444397, + "sampling/sampling_logp_difference/max": 0.4061850309371948, + "sampling/sampling_logp_difference/mean": 0.014037791639566422, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 177.96875, + "completions/mean_terminated_length": 177.96875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.37972375750541687, + "epoch": 2.113970588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01885291795983761, + "kl": 0.028345579281449318, + "learning_rate": 2.450808377122107e-07, + "loss": 0.0003, + "num_tokens": 54582034.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.35982346534729, + "sampling/importance_sampling_ratio/mean": 1.0004932880401611, + "sampling/importance_sampling_ratio/min": 0.7064409852027893, + "sampling/sampling_logp_difference/max": 0.3475155830383301, + "sampling/sampling_logp_difference/mean": 0.014719485305249691, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 197.078125, + "completions/mean_terminated_length": 197.078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3369651436805725, + "epoch": 2.1151960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018305539672001096, + "kl": 0.02534719929099083, + "learning_rate": 2.4446825795588716e-07, + "loss": 0.0002, + "num_tokens": 54614135.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3517695665359497, + "sampling/importance_sampling_ratio/mean": 0.999598503112793, + "sampling/importance_sampling_ratio/min": 0.6802843809127808, + "sampling/sampling_logp_difference/max": 0.38524436950683594, + "sampling/sampling_logp_difference/mean": 0.013371542096138, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 242.765625, + "completions/mean_terminated_length": 242.765625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.38267743587493896, + "epoch": 2.116421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03252813883627779, + "kl": 0.043914198875427246, + "learning_rate": 2.438561969128114e-07, + "loss": 0.0005, + "num_tokens": 54649784.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071556568145752, + "sampling/importance_sampling_ratio/mean": 0.9998723268508911, + "sampling/importance_sampling_ratio/min": 0.6065890789031982, + "sampling/sampling_logp_difference/max": 0.49990367889404297, + "sampling/sampling_logp_difference/mean": 0.012969196774065495, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 170.390625, + "completions/mean_terminated_length": 170.390625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.39506852626800537, + "epoch": 2.1176470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0446569400289303, + "kl": 0.04495479539036751, + "learning_rate": 2.43244655825429e-07, + "loss": -0.0077, + "num_tokens": 54674865.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.6598515510559082, + "sampling/importance_sampling_ratio/mean": 1.0003743171691895, + "sampling/importance_sampling_ratio/min": 0.7296468019485474, + "sampling/sampling_logp_difference/max": 0.5067281723022461, + "sampling/sampling_logp_difference/mean": 0.016044920310378075, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 163.34375, + "completions/mean_terminated_length": 163.34375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.2899906039237976, + "epoch": 2.1188725490196076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027816125360010296, + "kl": 0.03442544862627983, + "learning_rate": 2.4263363593512903e-07, + "loss": 0.0003, + "num_tokens": 54700055.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4803112745285034, + "sampling/importance_sampling_ratio/mean": 0.9994245767593384, + "sampling/importance_sampling_ratio/min": 0.6799276471138, + "sampling/sampling_logp_difference/max": 0.39225244522094727, + "sampling/sampling_logp_difference/mean": 0.013562307693064213, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 195.96875, + "completions/mean_terminated_length": 195.96875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4244498610496521, + "epoch": 2.1200980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9310287504765901, + "kl": 0.038948509842157364, + "learning_rate": 2.4202313848224364e-07, + "loss": -0.0042, + "num_tokens": 54729957.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5012526512145996, + "sampling/importance_sampling_ratio/mean": 0.9999551177024841, + "sampling/importance_sampling_ratio/min": 0.6009517312049866, + "sampling/sampling_logp_difference/max": 0.5092406272888184, + "sampling/sampling_logp_difference/mean": 0.016196755692362785, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 273.15625, + "completions/mean_terminated_length": 273.15625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4847327470779419, + "epoch": 2.1213235294117645, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.015332241408022, + "kl": 0.0634094625711441, + "learning_rate": 2.414131647060436e-07, + "loss": 0.0162, + "num_tokens": 54773119.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003219842910767, + "sampling/importance_sampling_ratio/min": 0.6625234484672546, + "sampling/sampling_logp_difference/max": 0.9516277313232422, + "sampling/sampling_logp_difference/mean": 0.0151731688529253, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 193.09375, + "completions/mean_terminated_length": 193.09375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.38262057304382324, + "epoch": 2.122549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02093126405835027, + "kl": 0.02932485193014145, + "learning_rate": 2.4080371584473745e-07, + "loss": 0.0003, + "num_tokens": 54801349.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4112415313720703, + "sampling/importance_sampling_ratio/mean": 1.0002636909484863, + "sampling/importance_sampling_ratio/min": 0.6678059101104736, + "sampling/sampling_logp_difference/max": 0.4037576913833618, + "sampling/sampling_logp_difference/mean": 0.01606777496635914, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 222.375, + "completions/mean_terminated_length": 222.375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.3532024621963501, + "epoch": 2.123774509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028520888685228243, + "kl": 0.03388208895921707, + "learning_rate": 2.4019479313546757e-07, + "loss": 0.0003, + "num_tokens": 54840125.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3584169149398804, + "sampling/importance_sampling_ratio/mean": 0.9997031092643738, + "sampling/importance_sampling_ratio/min": 0.591215193271637, + "sampling/sampling_logp_difference/max": 0.5255752205848694, + "sampling/sampling_logp_difference/mean": 0.012861925177276134, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 209.453125, + "completions/mean_terminated_length": 209.453125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.4223269522190094, + "epoch": 2.125, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9339945437108128, + "kl": 0.045595258474349976, + "learning_rate": 2.395863978143083e-07, + "loss": -0.0341, + "num_tokens": 54879226.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000205159187317, + "sampling/importance_sampling_ratio/min": 0.6413000822067261, + "sampling/sampling_logp_difference/max": 0.9379068613052368, + "sampling/sampling_logp_difference/mean": 0.016056055203080177, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 187.0625, + "completions/mean_terminated_length": 187.0625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.32577162981033325, + "epoch": 2.126225490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02807865217048386, + "kl": 0.03253490477800369, + "learning_rate": 2.3897853111626417e-07, + "loss": 0.0003, + "num_tokens": 54911102.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6307547092437744, + "sampling/importance_sampling_ratio/mean": 1.0000345706939697, + "sampling/importance_sampling_ratio/min": 0.5113306641578674, + "sampling/sampling_logp_difference/max": 0.6707388162612915, + "sampling/sampling_logp_difference/mean": 0.012766007333993912, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 155.859375, + "completions/mean_terminated_length": 155.859375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.40753477811813354, + "epoch": 2.127450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03305307656653402, + "kl": 0.03920695185661316, + "learning_rate": 2.383711942752652e-07, + "loss": 0.0004, + "num_tokens": 54939541.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5626815557479858, + "sampling/importance_sampling_ratio/mean": 0.9999619722366333, + "sampling/importance_sampling_ratio/min": 0.7706030011177063, + "sampling/sampling_logp_difference/max": 0.44640326499938965, + "sampling/sampling_logp_difference/mean": 0.015735357999801636, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 226.140625, + "completions/mean_terminated_length": 226.140625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4630538523197174, + "epoch": 2.1286764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7251203505688483, + "kl": 0.05249332636594772, + "learning_rate": 2.377643885241674e-07, + "loss": -0.0052, + "num_tokens": 54977422.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4917874336242676, + "sampling/importance_sampling_ratio/mean": 1.0001999139785767, + "sampling/importance_sampling_ratio/min": 0.6482202410697937, + "sampling/sampling_logp_difference/max": 0.4335247278213501, + "sampling/sampling_logp_difference/mean": 0.015336014330387115, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 208.796875, + "completions/mean_terminated_length": 208.796875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.4857085943222046, + "epoch": 2.1299019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8431751513723151, + "kl": 0.04069080576300621, + "learning_rate": 2.371581150947476e-07, + "loss": 0.0155, + "num_tokens": 55009233.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3904049396514893, + "sampling/importance_sampling_ratio/mean": 0.999758243560791, + "sampling/importance_sampling_ratio/min": 0.6156575679779053, + "sampling/sampling_logp_difference/max": 0.48506438732147217, + "sampling/sampling_logp_difference/mean": 0.01667597144842148, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 236.828125, + "completions/mean_terminated_length": 236.828125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3865073323249817, + "epoch": 2.1311274509803924, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7945251971627644, + "kl": 0.03433482348918915, + "learning_rate": 2.3655237521770282e-07, + "loss": 0.0227, + "num_tokens": 55044326.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.44430410861969, + "sampling/importance_sampling_ratio/mean": 0.9998413324356079, + "sampling/importance_sampling_ratio/min": 0.639488697052002, + "sampling/sampling_logp_difference/max": 0.4470863342285156, + "sampling/sampling_logp_difference/mean": 0.013868868350982666, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 225.828125, + "completions/mean_terminated_length": 225.828125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.4157715141773224, + "epoch": 2.1323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023297379077723845, + "kl": 0.02754153311252594, + "learning_rate": 2.3594717012264642e-07, + "loss": 0.0003, + "num_tokens": 55078379.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9877310991287231, + "sampling/importance_sampling_ratio/mean": 1.000409483909607, + "sampling/importance_sampling_ratio/min": 0.5863022208213806, + "sampling/sampling_logp_difference/max": 0.6869938373565674, + "sampling/sampling_logp_difference/mean": 0.014839432202279568, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 226.671875, + "completions/mean_terminated_length": 226.671875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4221544563770294, + "epoch": 2.133578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016172086290548435, + "kl": 0.03135847672820091, + "learning_rate": 2.3534250103810627e-07, + "loss": 0.0003, + "num_tokens": 55111334.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7786999940872192, + "sampling/importance_sampling_ratio/mean": 1.0000792741775513, + "sampling/importance_sampling_ratio/min": 0.6914989948272705, + "sampling/sampling_logp_difference/max": 0.5758827924728394, + "sampling/sampling_logp_difference/mean": 0.015480948612093925, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 267.25, + "completions/mean_terminated_length": 267.25, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.4866619110107422, + "epoch": 2.1348039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0196562728494614, + "kl": 0.054346490651369095, + "learning_rate": 2.3473836919152263e-07, + "loss": -0.003, + "num_tokens": 55148918.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.8142759799957275, + "sampling/importance_sampling_ratio/mean": 1.0004303455352783, + "sampling/importance_sampling_ratio/min": 0.4147556722164154, + "sampling/sampling_logp_difference/max": 0.8800656795501709, + "sampling/sampling_logp_difference/mean": 0.016502011567354202, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 201.453125, + "completions/mean_terminated_length": 201.453125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.423952579498291, + "epoch": 2.136029411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7835543064481314, + "kl": 0.06791205704212189, + "learning_rate": 2.3413477580924475e-07, + "loss": 0.0385, + "num_tokens": 55179555.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4241691827774048, + "sampling/importance_sampling_ratio/mean": 1.0000262260437012, + "sampling/importance_sampling_ratio/min": 0.6093135476112366, + "sampling/sampling_logp_difference/max": 0.49542236328125, + "sampling/sampling_logp_difference/mean": 0.015081222169101238, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 184.328125, + "completions/mean_terminated_length": 184.328125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.34962934255599976, + "epoch": 2.1372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018699136234511006, + "kl": 0.026226745918393135, + "learning_rate": 2.3353172211652884e-07, + "loss": 0.0003, + "num_tokens": 55212616.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4477838277816772, + "sampling/importance_sampling_ratio/mean": 1.000114917755127, + "sampling/importance_sampling_ratio/min": 0.6632859706878662, + "sampling/sampling_logp_difference/max": 0.4105490446090698, + "sampling/sampling_logp_difference/mean": 0.013516898266971111, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 202.203125, + "completions/mean_terminated_length": 202.203125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3871409296989441, + "epoch": 2.138480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7748667693524827, + "kl": 0.043323516845703125, + "learning_rate": 2.329292093375356e-07, + "loss": -0.0124, + "num_tokens": 55241413.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5215389728546143, + "sampling/importance_sampling_ratio/mean": 1.0001754760742188, + "sampling/importance_sampling_ratio/min": 0.6299866437911987, + "sampling/sampling_logp_difference/max": 0.46205663681030273, + "sampling/sampling_logp_difference/mean": 0.014800415374338627, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 181.3125, + "completions/mean_terminated_length": 181.3125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.4962119162082672, + "epoch": 2.139705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030377870768547713, + "kl": 0.06676479429006577, + "learning_rate": 2.3232723869532816e-07, + "loss": 0.0007, + "num_tokens": 55271945.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4092849493026733, + "sampling/importance_sampling_ratio/mean": 0.9996241331100464, + "sampling/importance_sampling_ratio/min": 0.6529548168182373, + "sampling/sampling_logp_difference/max": 0.42624735832214355, + "sampling/sampling_logp_difference/mean": 0.016974840313196182, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 198.328125, + "completions/mean_terminated_length": 198.328125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4542536735534668, + "epoch": 2.1409313725490198, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1187618944155129, + "kl": 0.0751677006483078, + "learning_rate": 2.3172581141186858e-07, + "loss": -0.0114, + "num_tokens": 55299358.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4223324060440063, + "sampling/importance_sampling_ratio/mean": 0.9998506903648376, + "sampling/importance_sampling_ratio/min": 0.6970131993293762, + "sampling/sampling_logp_difference/max": 0.36095094680786133, + "sampling/sampling_logp_difference/mean": 0.016000093892216682, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 212.21875, + "completions/mean_terminated_length": 212.21875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.41988879442214966, + "epoch": 2.142156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025334713347893854, + "kl": 0.04268532991409302, + "learning_rate": 2.3112492870801602e-07, + "loss": 0.0004, + "num_tokens": 55333452.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4602017402648926, + "sampling/importance_sampling_ratio/mean": 0.9998769760131836, + "sampling/importance_sampling_ratio/min": 0.6052498817443848, + "sampling/sampling_logp_difference/max": 0.5021138191223145, + "sampling/sampling_logp_difference/mean": 0.015501865185797215, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 203.9375, + "completions/mean_terminated_length": 203.9375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.34786057472229004, + "epoch": 2.1433823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019621902407131974, + "kl": 0.03109024092555046, + "learning_rate": 2.3052459180352458e-07, + "loss": 0.0003, + "num_tokens": 55366120.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6022891998291016, + "sampling/importance_sampling_ratio/mean": 1.0002949237823486, + "sampling/importance_sampling_ratio/min": 0.6090093851089478, + "sampling/sampling_logp_difference/max": 0.49592161178588867, + "sampling/sampling_logp_difference/mean": 0.014146235771477222, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 265.09375, + "completions/mean_terminated_length": 265.09375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.3917834162712097, + "epoch": 2.144607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015290380997986797, + "kl": 0.024292241781949997, + "learning_rate": 2.2992480191704e-07, + "loss": 0.0002, + "num_tokens": 55407694.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9756038188934326, + "sampling/importance_sampling_ratio/mean": 1.0001131296157837, + "sampling/importance_sampling_ratio/min": 0.6060279607772827, + "sampling/sampling_logp_difference/max": 0.6808741092681885, + "sampling/sampling_logp_difference/mean": 0.013634631410241127, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 182.53125, + "completions/mean_terminated_length": 182.53125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.33123937249183655, + "epoch": 2.1458333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9168386050176408, + "kl": 0.043895699083805084, + "learning_rate": 2.2932556026609777e-07, + "loss": -0.0295, + "num_tokens": 55441120.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5277621746063232, + "sampling/importance_sampling_ratio/mean": 1.0001862049102783, + "sampling/importance_sampling_ratio/min": 0.6155057549476624, + "sampling/sampling_logp_difference/max": 0.48531103134155273, + "sampling/sampling_logp_difference/mean": 0.012891734018921852, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 257.296875, + "completions/mean_terminated_length": 257.296875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.4753401577472687, + "epoch": 2.1470588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029490820147152666, + "kl": 0.03417005389928818, + "learning_rate": 2.2872686806712032e-07, + "loss": 0.0004, + "num_tokens": 55479907.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4761298894882202, + "sampling/importance_sampling_ratio/mean": 1.000026822090149, + "sampling/importance_sampling_ratio/min": 0.6266134977340698, + "sampling/sampling_logp_difference/max": 0.4674253463745117, + "sampling/sampling_logp_difference/mean": 0.016516663134098053, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 219.28125, + "completions/mean_terminated_length": 219.28125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.5341153144836426, + "epoch": 2.1482843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02628875241651288, + "kl": 0.045024674385786057, + "learning_rate": 2.2812872653541498e-07, + "loss": 0.0005, + "num_tokens": 55518293.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3663583993911743, + "sampling/importance_sampling_ratio/mean": 1.0003935098648071, + "sampling/importance_sampling_ratio/min": 0.6626349687576294, + "sampling/sampling_logp_difference/max": 0.4115309715270996, + "sampling/sampling_logp_difference/mean": 0.017857030034065247, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 219.015625, + "completions/mean_terminated_length": 219.015625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4271315634250641, + "epoch": 2.1495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7242580390124498, + "kl": 0.052868179976940155, + "learning_rate": 2.2753113688517155e-07, + "loss": 0.0087, + "num_tokens": 55555750.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.579295039176941, + "sampling/importance_sampling_ratio/mean": 1.000077247619629, + "sampling/importance_sampling_ratio/min": 0.5911200046539307, + "sampling/sampling_logp_difference/max": 0.5257362127304077, + "sampling/sampling_logp_difference/mean": 0.014551831409335136, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 251.65625, + "completions/mean_terminated_length": 251.65625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3884498178958893, + "epoch": 2.150735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8531369846460369, + "kl": 0.02951735258102417, + "learning_rate": 2.2693410032945853e-07, + "loss": -0.0039, + "num_tokens": 55594352.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5480871200561523, + "sampling/importance_sampling_ratio/mean": 1.0000702142715454, + "sampling/importance_sampling_ratio/min": 0.7222326397895813, + "sampling/sampling_logp_difference/max": 0.43702006340026855, + "sampling/sampling_logp_difference/mean": 0.0142079321667552, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 198.078125, + "completions/mean_terminated_length": 198.078125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4085216820240021, + "epoch": 2.1519607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018627729681306757, + "kl": 0.02471754141151905, + "learning_rate": 2.2633761808022272e-07, + "loss": 0.0002, + "num_tokens": 55627093.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6256582736968994, + "sampling/importance_sampling_ratio/mean": 0.9996710419654846, + "sampling/importance_sampling_ratio/min": 0.6510365605354309, + "sampling/sampling_logp_difference/max": 0.4859127998352051, + "sampling/sampling_logp_difference/mean": 0.015194547362625599, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 162.53125, + "completions/mean_terminated_length": 162.53125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3991209864616394, + "epoch": 2.153186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7667242884706594, + "kl": 0.06161896884441376, + "learning_rate": 2.2574169134828526e-07, + "loss": -0.0191, + "num_tokens": 55652567.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.439162254333496, + "sampling/importance_sampling_ratio/mean": 1.0000481605529785, + "sampling/importance_sampling_ratio/min": 0.6192695498466492, + "sampling/sampling_logp_difference/max": 0.4792146682739258, + "sampling/sampling_logp_difference/mean": 0.01625451073050499, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 223.53125, + "completions/mean_terminated_length": 223.53125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.5019240379333496, + "epoch": 2.1544117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.601437594217833, + "kl": 0.0837329775094986, + "learning_rate": 2.2514632134333932e-07, + "loss": -0.0097, + "num_tokens": 55684553.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4870280027389526, + "sampling/importance_sampling_ratio/mean": 1.0001258850097656, + "sampling/importance_sampling_ratio/min": 0.637239933013916, + "sampling/sampling_logp_difference/max": 0.4506089687347412, + "sampling/sampling_logp_difference/mean": 0.017832860350608826, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 261.484375, + "completions/mean_terminated_length": 261.484375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3401559591293335, + "epoch": 2.155637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6481609273172343, + "kl": 0.033339258283376694, + "learning_rate": 2.2455150927394878e-07, + "loss": -0.0147, + "num_tokens": 55719624.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4729355573654175, + "sampling/importance_sampling_ratio/mean": 0.9997318983078003, + "sampling/importance_sampling_ratio/min": 0.629811704158783, + "sampling/sampling_logp_difference/max": 0.46233439445495605, + "sampling/sampling_logp_difference/mean": 0.011864010244607925, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 233.09375, + "completions/mean_terminated_length": 233.09375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.45578038692474365, + "epoch": 2.156862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8057610589224533, + "kl": 0.03374443203210831, + "learning_rate": 2.2395725634754402e-07, + "loss": 0.0215, + "num_tokens": 55754254.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4249826669692993, + "sampling/importance_sampling_ratio/mean": 1.0005931854248047, + "sampling/importance_sampling_ratio/min": 0.6372078061103821, + "sampling/sampling_logp_difference/max": 0.45065951347351074, + "sampling/sampling_logp_difference/mean": 0.016326431185007095, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 220.1875, + "completions/mean_terminated_length": 220.1875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3218801021575928, + "epoch": 2.1580882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01423016017040416, + "kl": 0.02158094383776188, + "learning_rate": 2.2336356377042143e-07, + "loss": 0.0002, + "num_tokens": 55783370.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.516823172569275, + "sampling/importance_sampling_ratio/mean": 0.9995269179344177, + "sampling/importance_sampling_ratio/min": 0.6208050847053528, + "sampling/sampling_logp_difference/max": 0.47673821449279785, + "sampling/sampling_logp_difference/mean": 0.013418269343674183, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 181.671875, + "completions/mean_terminated_length": 181.671875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.36425530910491943, + "epoch": 2.159313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029893002756899607, + "kl": 0.04176114499568939, + "learning_rate": 2.2277043274773854e-07, + "loss": 0.0004, + "num_tokens": 55813365.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007848978042603, + "sampling/importance_sampling_ratio/mean": 1.0001916885375977, + "sampling/importance_sampling_ratio/min": 0.6393733620643616, + "sampling/sampling_logp_difference/max": 0.47049403190612793, + "sampling/sampling_logp_difference/mean": 0.01389409601688385, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 169.78125, + "completions/mean_terminated_length": 169.78125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.33382275700569153, + "epoch": 2.1605392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020204153809200747, + "kl": 0.029215075075626373, + "learning_rate": 2.221778644835144e-07, + "loss": 0.0003, + "num_tokens": 55840471.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7818156480789185, + "sampling/importance_sampling_ratio/mean": 1.0002161264419556, + "sampling/importance_sampling_ratio/min": 0.6140713691711426, + "sampling/sampling_logp_difference/max": 0.5776329040527344, + "sampling/sampling_logp_difference/mean": 0.014868385158479214, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 176.390625, + "completions/mean_terminated_length": 176.390625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.35788238048553467, + "epoch": 2.161764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021212287763992144, + "kl": 0.028317486867308617, + "learning_rate": 2.215858601806246e-07, + "loss": 0.0003, + "num_tokens": 55867280.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5563019514083862, + "sampling/importance_sampling_ratio/mean": 0.9995063543319702, + "sampling/importance_sampling_ratio/min": 0.41623008251190186, + "sampling/sampling_logp_difference/max": 0.8765170574188232, + "sampling/sampling_logp_difference/mean": 0.014820320531725883, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 189.328125, + "completions/mean_terminated_length": 189.328125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.32203784584999084, + "epoch": 2.1629901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02518747976591494, + "kl": 0.027678757905960083, + "learning_rate": 2.2099442104080075e-07, + "loss": 0.0003, + "num_tokens": 55893717.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4755914211273193, + "sampling/importance_sampling_ratio/mean": 0.9997338652610779, + "sampling/importance_sampling_ratio/min": 0.6314525008201599, + "sampling/sampling_logp_difference/max": 0.4597325325012207, + "sampling/sampling_logp_difference/mean": 0.013499148190021515, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 239.65625, + "completions/mean_terminated_length": 239.65625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.46300262212753296, + "epoch": 2.1642156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7057487602303449, + "kl": 0.05219618231058121, + "learning_rate": 2.2040354826462664e-07, + "loss": 0.0082, + "num_tokens": 55930383.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6213703155517578, + "sampling/importance_sampling_ratio/mean": 0.9996943473815918, + "sampling/importance_sampling_ratio/min": 0.6223365664482117, + "sampling/sampling_logp_difference/max": 0.48327159881591797, + "sampling/sampling_logp_difference/mean": 0.01587051898241043, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 157.671875, + "completions/mean_terminated_length": 157.671875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3129045069217682, + "epoch": 2.1654411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023785586631170752, + "kl": 0.03676670044660568, + "learning_rate": 2.1981324305153642e-07, + "loss": 0.0003, + "num_tokens": 55955546.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6219816207885742, + "sampling/importance_sampling_ratio/mean": 1.00071382522583, + "sampling/importance_sampling_ratio/min": 0.7006001472473145, + "sampling/sampling_logp_difference/max": 0.48364853858947754, + "sampling/sampling_logp_difference/mean": 0.014173779636621475, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 235.3125, + "completions/mean_terminated_length": 235.3125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.49570128321647644, + "epoch": 2.1666666666666665, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.078351481905372, + "kl": 0.09133053570985794, + "learning_rate": 2.192235065998126e-07, + "loss": -0.0105, + "num_tokens": 55989134.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5448896884918213, + "sampling/importance_sampling_ratio/mean": 0.998965859413147, + "sampling/importance_sampling_ratio/min": 0.6119261384010315, + "sampling/sampling_logp_difference/max": 0.49114370346069336, + "sampling/sampling_logp_difference/mean": 0.017726827412843704, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 203.765625, + "completions/mean_terminated_length": 203.765625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3536602258682251, + "epoch": 2.167892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018581054929521598, + "kl": 0.024750471115112305, + "learning_rate": 2.1863434010658272e-07, + "loss": 0.0002, + "num_tokens": 56020303.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5407379865646362, + "sampling/importance_sampling_ratio/mean": 1.0003414154052734, + "sampling/importance_sampling_ratio/min": 0.7101253867149353, + "sampling/sampling_logp_difference/max": 0.43226146697998047, + "sampling/sampling_logp_difference/mean": 0.013323700055480003, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 203.578125, + "completions/mean_terminated_length": 203.578125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.45446696877479553, + "epoch": 2.1691176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02235162020305461, + "kl": 0.035784099251031876, + "learning_rate": 2.1804574476781733e-07, + "loss": 0.0003, + "num_tokens": 56049460.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5750471353530884, + "sampling/importance_sampling_ratio/mean": 0.9996752738952637, + "sampling/importance_sampling_ratio/min": 0.6301924586296082, + "sampling/sampling_logp_difference/max": 0.4617300033569336, + "sampling/sampling_logp_difference/mean": 0.016752976924180984, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 170.578125, + "completions/mean_terminated_length": 170.578125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.29969990253448486, + "epoch": 2.170343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0173527336844538, + "kl": 0.024074679240584373, + "learning_rate": 2.1745772177832755e-07, + "loss": 0.0002, + "num_tokens": 56079833.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5046863555908203, + "sampling/importance_sampling_ratio/mean": 0.9999350905418396, + "sampling/importance_sampling_ratio/min": 0.6972651481628418, + "sampling/sampling_logp_difference/max": 0.40858447551727295, + "sampling/sampling_logp_difference/mean": 0.011612225323915482, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 203.78125, + "completions/mean_terminated_length": 203.78125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.2922014594078064, + "epoch": 2.1715686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017945475451335936, + "kl": 0.023235086351633072, + "learning_rate": 2.1687027233176318e-07, + "loss": 0.0002, + "num_tokens": 56107307.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4358521699905396, + "sampling/importance_sampling_ratio/mean": 0.9999434947967529, + "sampling/importance_sampling_ratio/min": 0.5521895289421082, + "sampling/sampling_logp_difference/max": 0.5938639640808105, + "sampling/sampling_logp_difference/mean": 0.012580793350934982, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 275.578125, + "completions/mean_terminated_length": 275.578125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3100026249885559, + "epoch": 2.172794117647059, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0856831069838297, + "kl": 0.03622254356741905, + "learning_rate": 2.1628339762060914e-07, + "loss": 0.0132, + "num_tokens": 56144720.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4009132385253906, + "sampling/importance_sampling_ratio/mean": 1.000067114830017, + "sampling/importance_sampling_ratio/min": 0.6049524545669556, + "sampling/sampling_logp_difference/max": 0.5026054382324219, + "sampling/sampling_logp_difference/mean": 0.012776928022503853, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 236.984375, + "completions/mean_terminated_length": 236.984375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.37184542417526245, + "epoch": 2.174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7717181530374922, + "kl": 0.055422067642211914, + "learning_rate": 2.1569709883618382e-07, + "loss": -0.005, + "num_tokens": 56183039.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5849214792251587, + "sampling/importance_sampling_ratio/mean": 1.000169038772583, + "sampling/importance_sampling_ratio/min": 0.6072410941123962, + "sampling/sampling_logp_difference/max": 0.49882936477661133, + "sampling/sampling_logp_difference/mean": 0.014009572565555573, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 272.984375, + "completions/mean_terminated_length": 272.984375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.4569479823112488, + "epoch": 2.1752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04731987310548251, + "kl": 0.052137378603219986, + "learning_rate": 2.1511137716863687e-07, + "loss": 0.0006, + "num_tokens": 56222382.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4306907653808594, + "sampling/importance_sampling_ratio/mean": 1.0007603168487549, + "sampling/importance_sampling_ratio/min": 0.710547924041748, + "sampling/sampling_logp_difference/max": 0.3581573963165283, + "sampling/sampling_logp_difference/mean": 0.01567883789539337, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 191.78125, + "completions/mean_terminated_length": 191.78125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.39548245072364807, + "epoch": 2.176470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1297765565500506, + "kl": 0.07630042731761932, + "learning_rate": 2.1452623380694602e-07, + "loss": -0.0205, + "num_tokens": 56252000.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3939874172210693, + "sampling/importance_sampling_ratio/mean": 1.0004687309265137, + "sampling/importance_sampling_ratio/min": 0.6412686705589294, + "sampling/sampling_logp_difference/max": 0.44430673122406006, + "sampling/sampling_logp_difference/mean": 0.015118453651666641, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 238.90625, + "completions/mean_terminated_length": 238.90625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4489264190196991, + "epoch": 2.1776960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023181921215847407, + "kl": 0.04035221040248871, + "learning_rate": 2.1394166993891526e-07, + "loss": 0.0004, + "num_tokens": 56289882.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3846248388290405, + "sampling/importance_sampling_ratio/mean": 0.9996110796928406, + "sampling/importance_sampling_ratio/min": 0.6264980435371399, + "sampling/sampling_logp_difference/max": 0.4676096439361572, + "sampling/sampling_logp_difference/mean": 0.015457483008503914, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 234.4375, + "completions/mean_terminated_length": 234.4375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3741098940372467, + "epoch": 2.178921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8540488270037477, + "kl": 0.025974810123443604, + "learning_rate": 2.1335768675117205e-07, + "loss": 0.026, + "num_tokens": 56324518.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5467331409454346, + "sampling/importance_sampling_ratio/mean": 1.0000572204589844, + "sampling/importance_sampling_ratio/min": 0.6638621687889099, + "sampling/sampling_logp_difference/max": 0.4361450672149658, + "sampling/sampling_logp_difference/mean": 0.01379794254899025, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 214.828125, + "completions/mean_terminated_length": 214.828125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4446960687637329, + "epoch": 2.1801470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1561242958987044, + "kl": 0.03877747058868408, + "learning_rate": 2.1277428542916555e-07, + "loss": 0.0157, + "num_tokens": 56357611.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.46567964553833, + "sampling/importance_sampling_ratio/mean": 1.00065016746521, + "sampling/importance_sampling_ratio/min": 0.6390963792800903, + "sampling/sampling_logp_difference/max": 0.44770002365112305, + "sampling/sampling_logp_difference/mean": 0.016978032886981964, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 208.328125, + "completions/mean_terminated_length": 208.328125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.3657262325286865, + "epoch": 2.1813725490196076, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6721604929803945, + "kl": 0.03483795002102852, + "learning_rate": 2.121914671571633e-07, + "loss": 0.0192, + "num_tokens": 56385216.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6058988571166992, + "sampling/importance_sampling_ratio/mean": 1.0005840063095093, + "sampling/importance_sampling_ratio/min": 0.747340977191925, + "sampling/sampling_logp_difference/max": 0.47368359565734863, + "sampling/sampling_logp_difference/mean": 0.014650316908955574, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 207.359375, + "completions/mean_terminated_length": 207.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.36643749475479126, + "epoch": 2.1825980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.724791888276185, + "kl": 0.03202063590288162, + "learning_rate": 2.1160923311824934e-07, + "loss": 0.0275, + "num_tokens": 56416727.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.422170639038086, + "sampling/importance_sampling_ratio/mean": 0.9998961091041565, + "sampling/importance_sampling_ratio/min": 0.583415687084198, + "sampling/sampling_logp_difference/max": 0.5388553142547607, + "sampling/sampling_logp_difference/mean": 0.01394546777009964, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 170.984375, + "completions/mean_terminated_length": 170.984375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4153890907764435, + "epoch": 2.1838235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8207494487650682, + "kl": 0.06203257292509079, + "learning_rate": 2.110275844943223e-07, + "loss": 0.0188, + "num_tokens": 56442854.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3838427066802979, + "sampling/importance_sampling_ratio/mean": 0.9999013543128967, + "sampling/importance_sampling_ratio/min": 0.6164276003837585, + "sampling/sampling_logp_difference/max": 0.4838144779205322, + "sampling/sampling_logp_difference/mean": 0.0162960272282362, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 153.390625, + "completions/mean_terminated_length": 153.390625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.2608410120010376, + "epoch": 2.185049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020464400904348198, + "kl": 0.026761263608932495, + "learning_rate": 2.1044652246609173e-07, + "loss": 0.0003, + "num_tokens": 56465183.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4588507413864136, + "sampling/importance_sampling_ratio/mean": 1.0002644062042236, + "sampling/importance_sampling_ratio/min": 0.6259872913360596, + "sampling/sampling_logp_difference/max": 0.46842527389526367, + "sampling/sampling_logp_difference/mean": 0.012925646267831326, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 211.484375, + "completions/mean_terminated_length": 211.484375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.41743314266204834, + "epoch": 2.186274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7483264520808547, + "kl": 0.06146371364593506, + "learning_rate": 2.098660482130768e-07, + "loss": -0.0007, + "num_tokens": 56491886.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.4583240747451782, + "sampling/importance_sampling_ratio/mean": 0.9997591972351074, + "sampling/importance_sampling_ratio/min": 0.6400062441825867, + "sampling/sampling_logp_difference/max": 0.446277379989624, + "sampling/sampling_logp_difference/mean": 0.015290914103388786, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 163.515625, + "completions/mean_terminated_length": 163.515625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.43135976791381836, + "epoch": 2.1875, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0479395928065414, + "kl": 0.053393132984638214, + "learning_rate": 2.092861629136033e-07, + "loss": -0.0004, + "num_tokens": 56518863.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5758846998214722, + "sampling/importance_sampling_ratio/mean": 1.0003823041915894, + "sampling/importance_sampling_ratio/min": 0.5807136297225952, + "sampling/sampling_logp_difference/max": 0.5434975624084473, + "sampling/sampling_logp_difference/mean": 0.01628931611776352, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 189.9375, + "completions/mean_terminated_length": 189.9375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.43314477801322937, + "epoch": 2.188725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039987186921131096, + "kl": 0.04892154783010483, + "learning_rate": 2.0870686774480196e-07, + "loss": 0.0005, + "num_tokens": 56548571.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6191405057907104, + "sampling/importance_sampling_ratio/mean": 1.0004253387451172, + "sampling/importance_sampling_ratio/min": 0.6203228235244751, + "sampling/sampling_logp_difference/max": 0.48189544677734375, + "sampling/sampling_logp_difference/mean": 0.01644906960427761, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 237.984375, + "completions/mean_terminated_length": 237.984375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3843547999858856, + "epoch": 2.189950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6781784825639222, + "kl": 0.02506193518638611, + "learning_rate": 2.0812816388260519e-07, + "loss": 0.0216, + "num_tokens": 56584634.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003893375396729, + "sampling/importance_sampling_ratio/min": 0.642171323299408, + "sampling/sampling_logp_difference/max": 0.709916353225708, + "sampling/sampling_logp_difference/mean": 0.013689187355339527, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 240.453125, + "completions/mean_terminated_length": 240.453125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.44558557868003845, + "epoch": 2.1911764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8566102775947064, + "kl": 0.04730532318353653, + "learning_rate": 2.0755005250174484e-07, + "loss": 0.0261, + "num_tokens": 56618487.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6304231882095337, + "sampling/importance_sampling_ratio/mean": 1.0002212524414062, + "sampling/importance_sampling_ratio/min": 0.6077031493186951, + "sampling/sampling_logp_difference/max": 0.49806880950927734, + "sampling/sampling_logp_difference/mean": 0.016564399003982544, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 199.84375, + "completions/mean_terminated_length": 199.84375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4355708062648773, + "epoch": 2.1924019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1387781585557262, + "kl": 0.12204273045063019, + "learning_rate": 2.0697253477575088e-07, + "loss": 0.0301, + "num_tokens": 56643789.0, + "reward": 0.21875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.9426008462905884, + "sampling/importance_sampling_ratio/mean": 0.9996011853218079, + "sampling/importance_sampling_ratio/min": 0.49107638001441956, + "sampling/sampling_logp_difference/max": 0.7111556529998779, + "sampling/sampling_logp_difference/mean": 0.016524486243724823, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 246.515625, + "completions/mean_terminated_length": 246.515625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.42478808760643005, + "epoch": 2.1936274509803924, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9592026126096747, + "kl": 0.03127903491258621, + "learning_rate": 2.0639561187694733e-07, + "loss": -0.0021, + "num_tokens": 56675310.0, + "reward": 0.0625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5297869443893433, + "sampling/importance_sampling_ratio/mean": 0.9999357461929321, + "sampling/importance_sampling_ratio/min": 0.6471536159515381, + "sampling/sampling_logp_difference/max": 0.43517160415649414, + "sampling/sampling_logp_difference/mean": 0.01433703675866127, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 220.140625, + "completions/mean_terminated_length": 220.140625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.44690874218940735, + "epoch": 2.1948529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7668036451017813, + "kl": 0.03841719776391983, + "learning_rate": 2.0581928497645164e-07, + "loss": -0.0074, + "num_tokens": 56707623.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.658127784729004, + "sampling/importance_sampling_ratio/mean": 1.000002384185791, + "sampling/importance_sampling_ratio/min": 0.6332475543022156, + "sampling/sampling_logp_difference/max": 0.5056891441345215, + "sampling/sampling_logp_difference/mean": 0.01577058434486389, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 182.109375, + "completions/mean_terminated_length": 182.109375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.406405508518219, + "epoch": 2.196078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7276034273699343, + "kl": 0.03541386127471924, + "learning_rate": 2.0524355524417015e-07, + "loss": 0.0113, + "num_tokens": 56737166.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.658131718635559, + "sampling/importance_sampling_ratio/mean": 0.9990932941436768, + "sampling/importance_sampling_ratio/min": 0.637708842754364, + "sampling/sampling_logp_difference/max": 0.5056915283203125, + "sampling/sampling_logp_difference/mean": 0.01556492131203413, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 201.359375, + "completions/mean_terminated_length": 201.359375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4390181303024292, + "epoch": 2.1973039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021430027985534342, + "kl": 0.03035741113126278, + "learning_rate": 2.0466842384879829e-07, + "loss": 0.0003, + "num_tokens": 56767061.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4529781341552734, + "sampling/importance_sampling_ratio/mean": 0.9994826316833496, + "sampling/importance_sampling_ratio/min": 0.7436936497688293, + "sampling/sampling_logp_difference/max": 0.3736152648925781, + "sampling/sampling_logp_difference/mean": 0.015260099433362484, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 266.5625, + "completions/mean_terminated_length": 266.5625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.3280014395713806, + "epoch": 2.198529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015950563319099935, + "kl": 0.03242075443267822, + "learning_rate": 2.0409389195781623e-07, + "loss": 0.0003, + "num_tokens": 56802265.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5215299129486084, + "sampling/importance_sampling_ratio/mean": 1.0002336502075195, + "sampling/importance_sampling_ratio/min": 0.6139700412750244, + "sampling/sampling_logp_difference/max": 0.4878091812133789, + "sampling/sampling_logp_difference/mean": 0.01232210174202919, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 186.78125, + "completions/mean_terminated_length": 186.78125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.40459394454956055, + "epoch": 2.1997549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01660475756175428, + "kl": 0.027100378647446632, + "learning_rate": 2.0351996073748713e-07, + "loss": 0.0003, + "num_tokens": 56835403.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4000202417373657, + "sampling/importance_sampling_ratio/mean": 1.0003581047058105, + "sampling/importance_sampling_ratio/min": 0.6682443022727966, + "sampling/sampling_logp_difference/max": 0.40310144424438477, + "sampling/sampling_logp_difference/mean": 0.016592102125287056, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 271.796875, + "completions/mean_terminated_length": 271.796875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.386542409658432, + "epoch": 2.200980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4588857037949476, + "kl": 0.027792006731033325, + "learning_rate": 2.0294663135285533e-07, + "loss": 0.0541, + "num_tokens": 56871982.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.480009913444519, + "sampling/importance_sampling_ratio/mean": 1.000190019607544, + "sampling/importance_sampling_ratio/min": 0.6394152045249939, + "sampling/sampling_logp_difference/max": 0.44720131158828735, + "sampling/sampling_logp_difference/mean": 0.013207310810685158, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 177.984375, + "completions/mean_terminated_length": 177.984375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4077008366584778, + "epoch": 2.202205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0073105546962986, + "kl": 0.061981189996004105, + "learning_rate": 2.0237390496774282e-07, + "loss": -0.0217, + "num_tokens": 56897805.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.3990243673324585, + "sampling/importance_sampling_ratio/mean": 0.9989504814147949, + "sampling/importance_sampling_ratio/min": 0.6263276934623718, + "sampling/sampling_logp_difference/max": 0.4678816795349121, + "sampling/sampling_logp_difference/mean": 0.016141217201948166, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 186.890625, + "completions/mean_terminated_length": 186.890625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3266540765762329, + "epoch": 2.2034313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01935019162071562, + "kl": 0.028891902416944504, + "learning_rate": 2.0180178274474834e-07, + "loss": 0.0003, + "num_tokens": 56931046.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.600102186203003, + "sampling/importance_sampling_ratio/mean": 0.9996902942657471, + "sampling/importance_sampling_ratio/min": 0.6493139266967773, + "sampling/sampling_logp_difference/max": 0.47006750106811523, + "sampling/sampling_logp_difference/mean": 0.01283689122647047, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 207.203125, + "completions/mean_terminated_length": 207.203125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4144088625907898, + "epoch": 2.204656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020018461654744822, + "kl": 0.02925487793982029, + "learning_rate": 2.012302658452432e-07, + "loss": 0.0003, + "num_tokens": 56959715.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9894824028015137, + "sampling/importance_sampling_ratio/mean": 1.000123143196106, + "sampling/importance_sampling_ratio/min": 0.7063222527503967, + "sampling/sampling_logp_difference/max": 0.6878745555877686, + "sampling/sampling_logp_difference/mean": 0.01610928401350975, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 163.875, + "completions/mean_terminated_length": 163.875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.42290157079696655, + "epoch": 2.2058823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01865298245981651, + "kl": 0.031292933970689774, + "learning_rate": 2.0065935542937073e-07, + "loss": 0.0003, + "num_tokens": 56987131.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4991945028305054, + "sampling/importance_sampling_ratio/mean": 1.0002739429473877, + "sampling/importance_sampling_ratio/min": 0.6922261118888855, + "sampling/sampling_logp_difference/max": 0.40492796897888184, + "sampling/sampling_logp_difference/mean": 0.017201673239469528, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 200.265625, + "completions/mean_terminated_length": 200.265625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.38739973306655884, + "epoch": 2.207107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01689581843543527, + "kl": 0.024199094623327255, + "learning_rate": 2.0008905265604315e-07, + "loss": 0.0002, + "num_tokens": 57019180.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5504204034805298, + "sampling/importance_sampling_ratio/mean": 0.9998184442520142, + "sampling/importance_sampling_ratio/min": 0.6028806567192078, + "sampling/sampling_logp_difference/max": 0.5060360431671143, + "sampling/sampling_logp_difference/mean": 0.015314217656850815, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 177.765625, + "completions/mean_terminated_length": 177.765625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3768065869808197, + "epoch": 2.2083333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02137046244544593, + "kl": 0.03218837454915047, + "learning_rate": 1.995193586829387e-07, + "loss": 0.0003, + "num_tokens": 57044493.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4296340942382812, + "sampling/importance_sampling_ratio/mean": 0.9995778799057007, + "sampling/importance_sampling_ratio/min": 0.6254597306251526, + "sampling/sampling_logp_difference/max": 0.4692683219909668, + "sampling/sampling_logp_difference/mean": 0.015134826302528381, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 215.78125, + "completions/mean_terminated_length": 215.78125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.41760993003845215, + "epoch": 2.2095588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8668926560172997, + "kl": 0.027726545929908752, + "learning_rate": 1.989502746665001e-07, + "loss": -0.0114, + "num_tokens": 57073247.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5832754373550415, + "sampling/importance_sampling_ratio/mean": 0.999932587146759, + "sampling/importance_sampling_ratio/min": 0.6758054494857788, + "sampling/sampling_logp_difference/max": 0.45949578285217285, + "sampling/sampling_logp_difference/mean": 0.014503438025712967, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 168.203125, + "completions/mean_terminated_length": 168.203125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.33378592133522034, + "epoch": 2.2107843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017364213449427, + "kl": 0.02794494479894638, + "learning_rate": 1.9838180176193176e-07, + "loss": 0.0003, + "num_tokens": 57111532.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8474539518356323, + "sampling/importance_sampling_ratio/mean": 1.0004229545593262, + "sampling/importance_sampling_ratio/min": 0.6781023144721985, + "sampling/sampling_logp_difference/max": 0.6138083934783936, + "sampling/sampling_logp_difference/mean": 0.013987618498504162, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 166.203125, + "completions/mean_terminated_length": 166.203125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3480472266674042, + "epoch": 2.2120098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017573714854610165, + "kl": 0.02866465598344803, + "learning_rate": 1.9781394112319787e-07, + "loss": 0.0003, + "num_tokens": 57136569.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6249321699142456, + "sampling/importance_sampling_ratio/mean": 1.0000239610671997, + "sampling/importance_sampling_ratio/min": 0.6299121379852295, + "sampling/sampling_logp_difference/max": 0.48546600341796875, + "sampling/sampling_logp_difference/mean": 0.01445926446467638, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 232.640625, + "completions/mean_terminated_length": 232.640625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.46496593952178955, + "epoch": 2.213235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6456536278620773, + "kl": 0.03583163022994995, + "learning_rate": 1.9724669390301946e-07, + "loss": -0.0097, + "num_tokens": 57173954.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4635348320007324, + "sampling/importance_sampling_ratio/mean": 1.0003316402435303, + "sampling/importance_sampling_ratio/min": 0.6590652465820312, + "sampling/sampling_logp_difference/max": 0.41693270206451416, + "sampling/sampling_logp_difference/mean": 0.015367222018539906, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 187.4375, + "completions/mean_terminated_length": 187.4375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.36628732085227966, + "epoch": 2.2144607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8709914404044724, + "kl": 0.038709282875061035, + "learning_rate": 1.9668006125287228e-07, + "loss": -0.0025, + "num_tokens": 57200974.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5865304470062256, + "sampling/importance_sampling_ratio/mean": 1.0000580549240112, + "sampling/importance_sampling_ratio/min": 0.6299977898597717, + "sampling/sampling_logp_difference/max": 0.4620389938354492, + "sampling/sampling_logp_difference/mean": 0.015021628700196743, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 221.609375, + "completions/mean_terminated_length": 221.609375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.35634899139404297, + "epoch": 2.215686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016042609747101415, + "kl": 0.028308294713497162, + "learning_rate": 1.96114044322985e-07, + "loss": 0.0003, + "num_tokens": 57229925.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6276687383651733, + "sampling/importance_sampling_ratio/mean": 0.9993330240249634, + "sampling/importance_sampling_ratio/min": 0.6147381663322449, + "sampling/sampling_logp_difference/max": 0.4871487617492676, + "sampling/sampling_logp_difference/mean": 0.01397157832980156, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 180.34375, + "completions/mean_terminated_length": 180.34375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.35287073254585266, + "epoch": 2.2169117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022584617604283003, + "kl": 0.030864350497722626, + "learning_rate": 1.9554864426233604e-07, + "loss": 0.0003, + "num_tokens": 57255003.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7157540321350098, + "sampling/importance_sampling_ratio/mean": 0.9992146492004395, + "sampling/importance_sampling_ratio/min": 0.32864677906036377, + "sampling/sampling_logp_difference/max": 1.11277174949646, + "sampling/sampling_logp_difference/mean": 0.015081110410392284, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 130.265625, + "completions/mean_terminated_length": 130.265625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.345086932182312, + "epoch": 2.218137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.122770356762049, + "kl": 0.06593969464302063, + "learning_rate": 1.9498386221865165e-07, + "loss": 0.0098, + "num_tokens": 57276764.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.3267107009887695, + "sampling/importance_sampling_ratio/mean": 1.0004007816314697, + "sampling/importance_sampling_ratio/min": 0.7094165086746216, + "sampling/sampling_logp_difference/max": 0.34331250190734863, + "sampling/sampling_logp_difference/mean": 0.015385487116873264, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 166.328125, + "completions/mean_terminated_length": 166.328125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3845037519931793, + "epoch": 2.219362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027420821102334385, + "kl": 0.05962613224983215, + "learning_rate": 1.944196993384034e-07, + "loss": 0.0006, + "num_tokens": 57310689.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002989768981934, + "sampling/importance_sampling_ratio/min": 0.548373281955719, + "sampling/sampling_logp_difference/max": 0.7172539234161377, + "sampling/sampling_logp_difference/mean": 0.016318058595061302, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 183.90625, + "completions/mean_terminated_length": 183.90625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3913060128688812, + "epoch": 2.2205882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0367113657543623, + "kl": 0.02648971602320671, + "learning_rate": 1.9385615676680661e-07, + "loss": 0.0149, + "num_tokens": 57339099.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4023281335830688, + "sampling/importance_sampling_ratio/mean": 1.0001801252365112, + "sampling/importance_sampling_ratio/min": 0.5448763370513916, + "sampling/sampling_logp_difference/max": 0.6071963310241699, + "sampling/sampling_logp_difference/mean": 0.015552809461951256, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 203.96875, + "completions/mean_terminated_length": 203.96875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3784450888633728, + "epoch": 2.221813725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0020648254354139, + "kl": 0.06395342946052551, + "learning_rate": 1.932932356478168e-07, + "loss": 0.0085, + "num_tokens": 57368041.0, + "reward": 0.125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.6509145498275757, + "sampling/importance_sampling_ratio/mean": 1.0001643896102905, + "sampling/importance_sampling_ratio/min": 0.7201745510101318, + "sampling/sampling_logp_difference/max": 0.5013294219970703, + "sampling/sampling_logp_difference/mean": 0.013813063502311707, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 204.03125, + "completions/mean_terminated_length": 204.03125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.42454639077186584, + "epoch": 2.2230392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0508410085374955, + "kl": 0.03902792930603027, + "learning_rate": 1.9273093712412796e-07, + "loss": 0.0388, + "num_tokens": 57398795.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.618973731994629, + "sampling/importance_sampling_ratio/mean": 0.9997764825820923, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4817924499511719, + "sampling/sampling_logp_difference/mean": 0.017444204539060593, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 178.390625, + "completions/mean_terminated_length": 178.390625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.35003480315208435, + "epoch": 2.224264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02101046059494195, + "kl": 0.02411719411611557, + "learning_rate": 1.9216926233717084e-07, + "loss": 0.0002, + "num_tokens": 57429876.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9010097980499268, + "sampling/importance_sampling_ratio/mean": 0.9996275901794434, + "sampling/importance_sampling_ratio/min": 0.6467365026473999, + "sampling/sampling_logp_difference/max": 0.6423852443695068, + "sampling/sampling_logp_difference/mean": 0.015089732594788074, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 194.859375, + "completions/mean_terminated_length": 194.859375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.37340590357780457, + "epoch": 2.2254901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.999134269824642, + "kl": 0.046641554683446884, + "learning_rate": 1.9160821242710957e-07, + "loss": -0.0064, + "num_tokens": 57458443.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.603572964668274, + "sampling/importance_sampling_ratio/mean": 1.000485897064209, + "sampling/importance_sampling_ratio/min": 0.6118009686470032, + "sampling/sampling_logp_difference/max": 0.4913482666015625, + "sampling/sampling_logp_difference/mean": 0.014087516814470291, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 231.40625, + "completions/mean_terminated_length": 231.40625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.33101558685302734, + "epoch": 2.2267156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019865124744391862, + "kl": 0.03153214231133461, + "learning_rate": 1.9104778853283987e-07, + "loss": 0.0003, + "num_tokens": 57490725.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6230480670928955, + "sampling/importance_sampling_ratio/mean": 1.0005909204483032, + "sampling/importance_sampling_ratio/min": 0.6489548087120056, + "sampling/sampling_logp_difference/max": 0.48430585861206055, + "sampling/sampling_logp_difference/mean": 0.013317778706550598, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 206.546875, + "completions/mean_terminated_length": 206.546875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4915628433227539, + "epoch": 2.2279411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1199918148309598, + "kl": 0.08405325561761856, + "learning_rate": 1.9048799179198655e-07, + "loss": 0.0123, + "num_tokens": 57517768.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.5523357391357422, + "sampling/importance_sampling_ratio/mean": 1.000276803970337, + "sampling/importance_sampling_ratio/min": 0.6532287001609802, + "sampling/sampling_logp_difference/max": 0.439760684967041, + "sampling/sampling_logp_difference/mean": 0.017245426774024963, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 173.9375, + "completions/mean_terminated_length": 173.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2633485794067383, + "epoch": 2.2291666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023926006425631696, + "kl": 0.031033962965011597, + "learning_rate": 1.8992882334090188e-07, + "loss": 0.0003, + "num_tokens": 57543588.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004593133926392, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.8591389656066895, + "sampling/sampling_logp_difference/mean": 0.012853178195655346, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 222.265625, + "completions/mean_terminated_length": 222.265625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3312186300754547, + "epoch": 2.230392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7378282345857047, + "kl": 0.020720241591334343, + "learning_rate": 1.893702843146623e-07, + "loss": 0.0373, + "num_tokens": 57578037.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006163120269775, + "sampling/importance_sampling_ratio/min": 0.379047691822052, + "sampling/sampling_logp_difference/max": 0.9700932502746582, + "sampling/sampling_logp_difference/mean": 0.012796593829989433, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 194.859375, + "completions/mean_terminated_length": 194.859375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3865714967250824, + "epoch": 2.2316176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019562472786731268, + "kl": 0.031382933259010315, + "learning_rate": 1.8881237584706632e-07, + "loss": 0.0003, + "num_tokens": 57606924.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3419557809829712, + "sampling/importance_sampling_ratio/mean": 0.9994385838508606, + "sampling/importance_sampling_ratio/min": 0.6411448121070862, + "sampling/sampling_logp_difference/max": 0.4444999694824219, + "sampling/sampling_logp_difference/mean": 0.01487137284129858, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 193.671875, + "completions/mean_terminated_length": 193.671875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.38141852617263794, + "epoch": 2.232843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02168403105048124, + "kl": 0.025620419532060623, + "learning_rate": 1.8825509907063326e-07, + "loss": 0.0002, + "num_tokens": 57634967.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4173139333724976, + "sampling/importance_sampling_ratio/mean": 1.000117301940918, + "sampling/importance_sampling_ratio/min": 0.6606555581092834, + "sampling/sampling_logp_difference/max": 0.414522647857666, + "sampling/sampling_logp_difference/mean": 0.014868944883346558, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 221.90625, + "completions/mean_terminated_length": 221.90625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.379791796207428, + "epoch": 2.2340686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9694196703380831, + "kl": 0.037543851882219315, + "learning_rate": 1.8769845511659927e-07, + "loss": -0.0104, + "num_tokens": 57667425.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4553523063659668, + "sampling/importance_sampling_ratio/mean": 1.0001063346862793, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4679851531982422, + "sampling/sampling_logp_difference/mean": 0.014764810912311077, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 177.5625, + "completions/mean_terminated_length": 177.5625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.36093395948410034, + "epoch": 2.235294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018977386129609426, + "kl": 0.027511999011039734, + "learning_rate": 1.871424451149169e-07, + "loss": 0.0003, + "num_tokens": 57696581.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4059081077575684, + "sampling/importance_sampling_ratio/mean": 0.999370276927948, + "sampling/importance_sampling_ratio/min": 0.7365430593490601, + "sampling/sampling_logp_difference/max": 0.3406834602355957, + "sampling/sampling_logp_difference/mean": 0.013832824304699898, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 255.015625, + "completions/mean_terminated_length": 255.015625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.43517765402793884, + "epoch": 2.236519607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9632358977010006, + "kl": 0.06032560020685196, + "learning_rate": 1.865870701942504e-07, + "loss": -0.0186, + "num_tokens": 57735958.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000049114227295, + "sampling/importance_sampling_ratio/min": 0.6070228219032288, + "sampling/sampling_logp_difference/max": 0.9196538925170898, + "sampling/sampling_logp_difference/mean": 0.0153459832072258, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 243.25, + "completions/mean_terminated_length": 243.25, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.370273232460022, + "epoch": 2.2377450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7517804749863046, + "kl": 0.04560757055878639, + "learning_rate": 1.8603233148197632e-07, + "loss": 0.0059, + "num_tokens": 57770118.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999024868011475, + "sampling/importance_sampling_ratio/min": 0.6707372069358826, + "sampling/sampling_logp_difference/max": 0.8612053394317627, + "sampling/sampling_logp_difference/mean": 0.013436258770525455, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 175.234375, + "completions/mean_terminated_length": 175.234375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.37318986654281616, + "epoch": 2.238970588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9467269096497812, + "kl": 0.04705360531806946, + "learning_rate": 1.8547823010417873e-07, + "loss": 0.0136, + "num_tokens": 57795829.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3680918216705322, + "sampling/importance_sampling_ratio/mean": 1.0007050037384033, + "sampling/importance_sampling_ratio/min": 0.6918064951896667, + "sampling/sampling_logp_difference/max": 0.36844897270202637, + "sampling/sampling_logp_difference/mean": 0.014947384595870972, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 203.296875, + "completions/mean_terminated_length": 203.296875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4300217032432556, + "epoch": 2.2401960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032675433367653266, + "kl": 0.05343396216630936, + "learning_rate": 1.8492476718564866e-07, + "loss": 0.0005, + "num_tokens": 57827912.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.546739101409912, + "sampling/importance_sampling_ratio/mean": 1.0007200241088867, + "sampling/importance_sampling_ratio/min": 0.6262629628181458, + "sampling/sampling_logp_difference/max": 0.4679849147796631, + "sampling/sampling_logp_difference/mean": 0.015370909124612808, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 228.3125, + "completions/mean_terminated_length": 228.3125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.40327417850494385, + "epoch": 2.241421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.732180587909517, + "kl": 0.043199148029088974, + "learning_rate": 1.8437194384988058e-07, + "loss": 0.0297, + "num_tokens": 57860476.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4032870531082153, + "sampling/importance_sampling_ratio/mean": 0.9995312094688416, + "sampling/importance_sampling_ratio/min": 0.6483083367347717, + "sampling/sampling_logp_difference/max": 0.4333888292312622, + "sampling/sampling_logp_difference/mean": 0.014346604235470295, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 151.515625, + "completions/mean_terminated_length": 151.515625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3040603995323181, + "epoch": 2.2426470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028195815879549224, + "kl": 0.03291485831141472, + "learning_rate": 1.8381976121907067e-07, + "loss": 0.0003, + "num_tokens": 57885885.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4264153242111206, + "sampling/importance_sampling_ratio/mean": 0.9997786283493042, + "sampling/importance_sampling_ratio/min": 0.6171460747718811, + "sampling/sampling_logp_difference/max": 0.482649564743042, + "sampling/sampling_logp_difference/mean": 0.01339528989046812, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 205.921875, + "completions/mean_terminated_length": 205.921875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3580760657787323, + "epoch": 2.2438725490196076, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01819422238485712, + "kl": 0.02745627611875534, + "learning_rate": 1.832682204141152e-07, + "loss": 0.0003, + "num_tokens": 57917576.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.507186770439148, + "sampling/importance_sampling_ratio/mean": 1.0001232624053955, + "sampling/importance_sampling_ratio/min": 0.6610642671585083, + "sampling/sampling_logp_difference/max": 0.41390419006347656, + "sampling/sampling_logp_difference/mean": 0.013558096252381802, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 177.25, + "completions/mean_terminated_length": 177.25, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3429805040359497, + "epoch": 2.2450980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.770708486626644, + "kl": 0.05356878042221069, + "learning_rate": 1.8271732255460643e-07, + "loss": 0.0009, + "num_tokens": 57946440.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6120491027832031, + "sampling/importance_sampling_ratio/mean": 1.0000370740890503, + "sampling/importance_sampling_ratio/min": 0.6269176006317139, + "sampling/sampling_logp_difference/max": 0.477506160736084, + "sampling/sampling_logp_difference/mean": 0.013620387762784958, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 210.484375, + "completions/mean_terminated_length": 210.484375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.40086644887924194, + "epoch": 2.2463235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023943640421089363, + "kl": 0.026479825377464294, + "learning_rate": 1.8216706875883252e-07, + "loss": 0.0003, + "num_tokens": 57976967.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.421567440032959, + "sampling/importance_sampling_ratio/mean": 1.0002470016479492, + "sampling/importance_sampling_ratio/min": 0.7061286568641663, + "sampling/sampling_logp_difference/max": 0.3517601490020752, + "sampling/sampling_logp_difference/mean": 0.014774276874959469, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 196.28125, + "completions/mean_terminated_length": 196.28125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.34230923652648926, + "epoch": 2.247549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014822602203528625, + "kl": 0.02011169120669365, + "learning_rate": 1.816174601437736e-07, + "loss": 0.0002, + "num_tokens": 58008969.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.449621319770813, + "sampling/importance_sampling_ratio/mean": 1.000059723854065, + "sampling/importance_sampling_ratio/min": 0.582772433757782, + "sampling/sampling_logp_difference/max": 0.5399584770202637, + "sampling/sampling_logp_difference/mean": 0.014088565483689308, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 234.609375, + "completions/mean_terminated_length": 234.609375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.5302902460098267, + "epoch": 2.248774509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1187976558045092, + "kl": 0.0506209135055542, + "learning_rate": 1.8106849782510058e-07, + "loss": 0.0394, + "num_tokens": 58041856.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997063875198364, + "sampling/importance_sampling_ratio/min": 0.48501065373420715, + "sampling/sampling_logp_difference/max": 0.9161038398742676, + "sampling/sampling_logp_difference/mean": 0.01879747584462166, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 200.515625, + "completions/mean_terminated_length": 200.515625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.4087119996547699, + "epoch": 2.25, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04229656470672519, + "kl": 0.03463684767484665, + "learning_rate": 1.8052018291717215e-07, + "loss": 0.0003, + "num_tokens": 58078033.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005056858062744, + "sampling/importance_sampling_ratio/min": 0.5393726825714111, + "sampling/sampling_logp_difference/max": 0.754511833190918, + "sampling/sampling_logp_difference/mean": 0.016497399657964706, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 179.328125, + "completions/mean_terminated_length": 179.328125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3928197920322418, + "epoch": 2.251225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9492877853035604, + "kl": 0.03979340195655823, + "learning_rate": 1.7997251653303247e-07, + "loss": -0.0348, + "num_tokens": 58110102.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.559341549873352, + "sampling/importance_sampling_ratio/mean": 1.0000081062316895, + "sampling/importance_sampling_ratio/min": 0.6039516925811768, + "sampling/sampling_logp_difference/max": 0.5042610168457031, + "sampling/sampling_logp_difference/mean": 0.015556180849671364, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 183.671875, + "completions/mean_terminated_length": 183.671875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.35463041067123413, + "epoch": 2.252450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8284520374198466, + "kl": 0.04506409540772438, + "learning_rate": 1.7942549978441012e-07, + "loss": 0.0121, + "num_tokens": 58140977.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5665878057479858, + "sampling/importance_sampling_ratio/mean": 1.0000871419906616, + "sampling/importance_sampling_ratio/min": 0.6382682919502258, + "sampling/sampling_logp_difference/max": 0.44899654388427734, + "sampling/sampling_logp_difference/mean": 0.014326345175504684, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 170.34375, + "completions/mean_terminated_length": 170.34375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.35513296723365784, + "epoch": 2.2536764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02140885507326921, + "kl": 0.0316675566136837, + "learning_rate": 1.7887913378171422e-07, + "loss": 0.0003, + "num_tokens": 58166695.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5436952114105225, + "sampling/importance_sampling_ratio/mean": 1.0004980564117432, + "sampling/importance_sampling_ratio/min": 0.6534729599952698, + "sampling/sampling_logp_difference/max": 0.43417906761169434, + "sampling/sampling_logp_difference/mean": 0.015069067478179932, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 219.03125, + "completions/mean_terminated_length": 219.03125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.31455889344215393, + "epoch": 2.2549019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016787786466763342, + "kl": 0.02507929503917694, + "learning_rate": 1.783334196340331e-07, + "loss": 0.0003, + "num_tokens": 58199289.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5739880800247192, + "sampling/importance_sampling_ratio/mean": 1.0001616477966309, + "sampling/importance_sampling_ratio/min": 0.6265933513641357, + "sampling/sampling_logp_difference/max": 0.46745753288269043, + "sampling/sampling_logp_difference/mean": 0.012406616471707821, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 169.953125, + "completions/mean_terminated_length": 169.953125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2821255326271057, + "epoch": 2.256127450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017764456952031223, + "kl": 0.028326895087957382, + "learning_rate": 1.777883584491317e-07, + "loss": 0.0003, + "num_tokens": 58224102.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.586036205291748, + "sampling/importance_sampling_ratio/mean": 0.999875545501709, + "sampling/importance_sampling_ratio/min": 0.6367641091346741, + "sampling/sampling_logp_difference/max": 0.46123790740966797, + "sampling/sampling_logp_difference/mean": 0.012191805988550186, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 174.234375, + "completions/mean_terminated_length": 174.234375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.34605711698532104, + "epoch": 2.2573529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019113516655231564, + "kl": 0.030208708718419075, + "learning_rate": 1.7724395133345022e-07, + "loss": 0.0003, + "num_tokens": 58255989.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6088987588882446, + "sampling/importance_sampling_ratio/mean": 1.0002429485321045, + "sampling/importance_sampling_ratio/min": 0.6771940588951111, + "sampling/sampling_logp_difference/max": 0.47554993629455566, + "sampling/sampling_logp_difference/mean": 0.013957983814179897, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 151.796875, + "completions/mean_terminated_length": 151.796875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3741613030433655, + "epoch": 2.258578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0317385444015215, + "kl": 0.0608445405960083, + "learning_rate": 1.7670019939210023e-07, + "loss": 0.0006, + "num_tokens": 58281800.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5932728052139282, + "sampling/importance_sampling_ratio/mean": 1.0003043413162231, + "sampling/importance_sampling_ratio/min": 0.6771954894065857, + "sampling/sampling_logp_difference/max": 0.4657902717590332, + "sampling/sampling_logp_difference/mean": 0.015265490859746933, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 155.5, + "completions/mean_terminated_length": 155.5, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.37429383397102356, + "epoch": 2.2598039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.897473156153675, + "kl": 0.04108111187815666, + "learning_rate": 1.761571037288637e-07, + "loss": -0.0059, + "num_tokens": 58306072.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.2885618209838867, + "sampling/importance_sampling_ratio/mean": 1.0001215934753418, + "sampling/importance_sampling_ratio/min": 0.6546775102615356, + "sampling/sampling_logp_difference/max": 0.42361247539520264, + "sampling/sampling_logp_difference/mean": 0.01529126986861229, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 199.453125, + "completions/mean_terminated_length": 199.453125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.4133285880088806, + "epoch": 2.261029411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8305264882597939, + "kl": 0.046958938241004944, + "learning_rate": 1.7561466544619076e-07, + "loss": 0.0054, + "num_tokens": 58342485.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4241420030593872, + "sampling/importance_sampling_ratio/mean": 0.9996469020843506, + "sampling/importance_sampling_ratio/min": 0.698478102684021, + "sampling/sampling_logp_difference/max": 0.35885143280029297, + "sampling/sampling_logp_difference/mean": 0.015273008495569229, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 148.453125, + "completions/mean_terminated_length": 148.453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.30733948945999146, + "epoch": 2.2622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025645417825013117, + "kl": 0.029887264594435692, + "learning_rate": 1.7507288564519646e-07, + "loss": 0.0003, + "num_tokens": 58366018.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5554358959197998, + "sampling/importance_sampling_ratio/mean": 0.9999924302101135, + "sampling/importance_sampling_ratio/min": 0.6394727230072021, + "sampling/sampling_logp_difference/max": 0.4471113681793213, + "sampling/sampling_logp_difference/mean": 0.014675739221274853, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 247.65625, + "completions/mean_terminated_length": 247.65625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.46249935030937195, + "epoch": 2.263480392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029007902574953028, + "kl": 0.04160034656524658, + "learning_rate": 1.7453176542565956e-07, + "loss": 0.0004, + "num_tokens": 58406588.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4351998567581177, + "sampling/importance_sampling_ratio/mean": 0.9995511770248413, + "sampling/importance_sampling_ratio/min": 0.6306003332138062, + "sampling/sampling_logp_difference/max": 0.46108293533325195, + "sampling/sampling_logp_difference/mean": 0.015138974413275719, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 208.15625, + "completions/mean_terminated_length": 208.15625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4643312692642212, + "epoch": 2.264705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023178761766796327, + "kl": 0.033480897545814514, + "learning_rate": 1.7399130588601968e-07, + "loss": 0.0003, + "num_tokens": 58445270.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7730165719985962, + "sampling/importance_sampling_ratio/mean": 1.0004382133483887, + "sampling/importance_sampling_ratio/min": 0.62339186668396, + "sampling/sampling_logp_difference/max": 0.5726823806762695, + "sampling/sampling_logp_difference/mean": 0.01806674152612686, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 204.0625, + "completions/mean_terminated_length": 204.0625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3384186625480652, + "epoch": 2.2659313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015163834042729219, + "kl": 0.02468809485435486, + "learning_rate": 1.7345150812337562e-07, + "loss": 0.0002, + "num_tokens": 58476682.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4060077667236328, + "sampling/importance_sampling_ratio/mean": 0.9995001554489136, + "sampling/importance_sampling_ratio/min": 0.6893008947372437, + "sampling/sampling_logp_difference/max": 0.37207746505737305, + "sampling/sampling_logp_difference/mean": 0.013668473809957504, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 236.703125, + "completions/mean_terminated_length": 236.703125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.36269164085388184, + "epoch": 2.267156862745098, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0564679752562487, + "kl": 0.03552190959453583, + "learning_rate": 1.7291237323348284e-07, + "loss": 0.053, + "num_tokens": 58507943.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5277749300003052, + "sampling/importance_sampling_ratio/mean": 0.9997976422309875, + "sampling/importance_sampling_ratio/min": 0.642673909664154, + "sampling/sampling_logp_difference/max": 0.4421178102493286, + "sampling/sampling_logp_difference/mean": 0.013543002307415009, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 191.171875, + "completions/mean_terminated_length": 191.171875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.4077395796775818, + "epoch": 2.2683823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017419935750676126, + "kl": 0.028113462030887604, + "learning_rate": 1.7237390231075055e-07, + "loss": 0.0003, + "num_tokens": 58543410.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.610731840133667, + "sampling/importance_sampling_ratio/mean": 1.000548005104065, + "sampling/importance_sampling_ratio/min": 0.7022053003311157, + "sampling/sampling_logp_difference/max": 0.4766886234283447, + "sampling/sampling_logp_difference/mean": 0.015693701803684235, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 190.40625, + "completions/mean_terminated_length": 190.40625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.38933029770851135, + "epoch": 2.269607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050897849937472595, + "kl": 0.04273064807057381, + "learning_rate": 1.7183609644824092e-07, + "loss": 0.0004, + "num_tokens": 58574476.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4586049318313599, + "sampling/importance_sampling_ratio/mean": 0.9996222853660583, + "sampling/importance_sampling_ratio/min": 0.6622437834739685, + "sampling/sampling_logp_difference/max": 0.4121215343475342, + "sampling/sampling_logp_difference/mean": 0.01635589264333248, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 207.515625, + "completions/mean_terminated_length": 207.515625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.40968212485313416, + "epoch": 2.2708333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7912864685020553, + "kl": 0.041417062282562256, + "learning_rate": 1.7129895673766575e-07, + "loss": 0.0109, + "num_tokens": 58604621.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.592902421951294, + "sampling/importance_sampling_ratio/mean": 0.999763548374176, + "sampling/importance_sampling_ratio/min": 0.635581374168396, + "sampling/sampling_logp_difference/max": 0.4655578136444092, + "sampling/sampling_logp_difference/mean": 0.01589188538491726, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 197.546875, + "completions/mean_terminated_length": 197.546875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3571816384792328, + "epoch": 2.2720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01711274830675318, + "kl": 0.023957177996635437, + "learning_rate": 1.707624842693844e-07, + "loss": 0.0002, + "num_tokens": 58643168.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5298007726669312, + "sampling/importance_sampling_ratio/mean": 0.9997508525848389, + "sampling/importance_sampling_ratio/min": 0.6468111872673035, + "sampling/sampling_logp_difference/max": 0.4357008934020996, + "sampling/sampling_logp_difference/mean": 0.014274682849645615, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 173.765625, + "completions/mean_terminated_length": 173.765625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.34523043036460876, + "epoch": 2.2732843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020056058550808338, + "kl": 0.03291430324316025, + "learning_rate": 1.7022668013240227e-07, + "loss": 0.0003, + "num_tokens": 58671761.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2966854572296143, + "sampling/importance_sampling_ratio/mean": 0.9996277093887329, + "sampling/importance_sampling_ratio/min": 0.6234764456748962, + "sampling/sampling_logp_difference/max": 0.4724442958831787, + "sampling/sampling_logp_difference/mean": 0.013802153058350086, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 175.09375, + "completions/mean_terminated_length": 175.09375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.460713267326355, + "epoch": 2.2745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025380685321179753, + "kl": 0.04328715801239014, + "learning_rate": 1.696915454143676e-07, + "loss": 0.0005, + "num_tokens": 58699495.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6357195377349854, + "sampling/importance_sampling_ratio/mean": 1.00018310546875, + "sampling/importance_sampling_ratio/min": 0.6576760411262512, + "sampling/sampling_logp_difference/max": 0.4920828342437744, + "sampling/sampling_logp_difference/mean": 0.017435822635889053, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 207.796875, + "completions/mean_terminated_length": 207.796875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.45708906650543213, + "epoch": 2.275735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8878465244198234, + "kl": 0.04913056641817093, + "learning_rate": 1.691570812015704e-07, + "loss": 0.0004, + "num_tokens": 58731610.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6162848472595215, + "sampling/importance_sampling_ratio/mean": 0.9995481371879578, + "sampling/importance_sampling_ratio/min": 0.7015320658683777, + "sampling/sampling_logp_difference/max": 0.4801301956176758, + "sampling/sampling_logp_difference/mean": 0.016341743990778923, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 188.484375, + "completions/mean_terminated_length": 188.484375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3209288716316223, + "epoch": 2.2769607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017785051769217963, + "kl": 0.021816005930304527, + "learning_rate": 1.6862328857893855e-07, + "loss": 0.0002, + "num_tokens": 58760393.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6555925607681274, + "sampling/importance_sampling_ratio/mean": 1.0000321865081787, + "sampling/importance_sampling_ratio/min": 0.6114137172698975, + "sampling/sampling_logp_difference/max": 0.5041589736938477, + "sampling/sampling_logp_difference/mean": 0.014840750023722649, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 201.890625, + "completions/mean_terminated_length": 201.890625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.5158641934394836, + "epoch": 2.278186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9370403207892111, + "kl": 0.06312260031700134, + "learning_rate": 1.680901686300376e-07, + "loss": 0.0314, + "num_tokens": 58795218.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4305166006088257, + "sampling/importance_sampling_ratio/mean": 1.0003159046173096, + "sampling/importance_sampling_ratio/min": 0.6245165467262268, + "sampling/sampling_logp_difference/max": 0.4707775115966797, + "sampling/sampling_logp_difference/mean": 0.01818527653813362, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 202.546875, + "completions/mean_terminated_length": 202.546875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4285881519317627, + "epoch": 2.2794117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025241984979734046, + "kl": 0.034309498965740204, + "learning_rate": 1.6755772243706712e-07, + "loss": 0.0003, + "num_tokens": 58826037.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6094081401824951, + "sampling/importance_sampling_ratio/mean": 0.9999540448188782, + "sampling/importance_sampling_ratio/min": 0.6262832880020142, + "sampling/sampling_logp_difference/max": 0.47586655616760254, + "sampling/sampling_logp_difference/mean": 0.01637199893593788, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 207.8125, + "completions/mean_terminated_length": 207.8125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.44034770131111145, + "epoch": 2.280637254901961, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9075536893096079, + "kl": 0.060066547244787216, + "learning_rate": 1.6702595108085942e-07, + "loss": 0.018, + "num_tokens": 58860473.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3368315696716309, + "sampling/importance_sampling_ratio/mean": 0.999178409576416, + "sampling/importance_sampling_ratio/min": 0.6380237936973572, + "sampling/sampling_logp_difference/max": 0.44937968254089355, + "sampling/sampling_logp_difference/mean": 0.015396175906062126, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 192.296875, + "completions/mean_terminated_length": 192.296875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4821186363697052, + "epoch": 2.281862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7283525685923916, + "kl": 0.05311132222414017, + "learning_rate": 1.6649485564087644e-07, + "loss": -0.0138, + "num_tokens": 58893596.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5695596933364868, + "sampling/importance_sampling_ratio/mean": 1.000570297241211, + "sampling/importance_sampling_ratio/min": 0.7008822560310364, + "sampling/sampling_logp_difference/max": 0.45079517364501953, + "sampling/sampling_logp_difference/mean": 0.01771252416074276, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 157.078125, + "completions/mean_terminated_length": 157.078125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.36174434423446655, + "epoch": 2.2830882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026088477295273103, + "kl": 0.034807972609996796, + "learning_rate": 1.6596443719520826e-07, + "loss": 0.0003, + "num_tokens": 58919969.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3641308546066284, + "sampling/importance_sampling_ratio/mean": 1.0005955696105957, + "sampling/importance_sampling_ratio/min": 0.6953715085983276, + "sampling/sampling_logp_difference/max": 0.36330902576446533, + "sampling/sampling_logp_difference/mean": 0.014958731830120087, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 202.734375, + "completions/mean_terminated_length": 202.734375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3093493580818176, + "epoch": 2.284313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016420602162738714, + "kl": 0.02179465815424919, + "learning_rate": 1.6543469682057104e-07, + "loss": 0.0002, + "num_tokens": 58949712.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.597166657447815, + "sampling/importance_sampling_ratio/mean": 1.000213861465454, + "sampling/importance_sampling_ratio/min": 0.6173874139785767, + "sampling/sampling_logp_difference/max": 0.48225855827331543, + "sampling/sampling_logp_difference/mean": 0.013449499383568764, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 182.671875, + "completions/mean_terminated_length": 182.671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.34986114501953125, + "epoch": 2.2855392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018810455421061027, + "kl": 0.02657250314950943, + "learning_rate": 1.6490563559230357e-07, + "loss": 0.0003, + "num_tokens": 58976267.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3779224157333374, + "sampling/importance_sampling_ratio/mean": 0.9998255372047424, + "sampling/importance_sampling_ratio/min": 0.6256580948829651, + "sampling/sampling_logp_difference/max": 0.4689512252807617, + "sampling/sampling_logp_difference/mean": 0.014288580045104027, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 227.0, + "completions/mean_terminated_length": 227.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.5183290243148804, + "epoch": 2.286764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8244515277802735, + "kl": 0.045404620468616486, + "learning_rate": 1.6437725458436725e-07, + "loss": -0.0018, + "num_tokens": 59007979.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.540433406829834, + "sampling/importance_sampling_ratio/mean": 1.0007423162460327, + "sampling/importance_sampling_ratio/min": 0.6332263946533203, + "sampling/sampling_logp_difference/max": 0.4569272994995117, + "sampling/sampling_logp_difference/mean": 0.01804598607122898, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 168.375, + "completions/mean_terminated_length": 168.375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.28167539834976196, + "epoch": 2.2879901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9389971339151029, + "kl": 0.03808104991912842, + "learning_rate": 1.6384955486934154e-07, + "loss": -0.0166, + "num_tokens": 59034019.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6505603790283203, + "sampling/importance_sampling_ratio/mean": 1.0002169609069824, + "sampling/importance_sampling_ratio/min": 0.6622359156608582, + "sampling/sampling_logp_difference/max": 0.5011148452758789, + "sampling/sampling_logp_difference/mean": 0.012344785034656525, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 201.3125, + "completions/mean_terminated_length": 201.3125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3862236738204956, + "epoch": 2.2892156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02561505746737744, + "kl": 0.028078967705368996, + "learning_rate": 1.633225375184239e-07, + "loss": 0.0003, + "num_tokens": 59064039.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.475394368171692, + "sampling/importance_sampling_ratio/mean": 1.000014066696167, + "sampling/importance_sampling_ratio/min": 0.6452736258506775, + "sampling/sampling_logp_difference/max": 0.43808090686798096, + "sampling/sampling_logp_difference/mean": 0.015584070235490799, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 164.625, + "completions/mean_terminated_length": 164.625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.40446603298187256, + "epoch": 2.2904411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9237385409935693, + "kl": 0.03162943571805954, + "learning_rate": 1.6279620360142594e-07, + "loss": -0.0055, + "num_tokens": 59087935.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.3203250169754028, + "sampling/importance_sampling_ratio/mean": 0.9996763467788696, + "sampling/importance_sampling_ratio/min": 0.6391890048980713, + "sampling/sampling_logp_difference/max": 0.4475550651550293, + "sampling/sampling_logp_difference/mean": 0.016145117580890656, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 164.15625, + "completions/mean_terminated_length": 164.15625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.34835562109947205, + "epoch": 2.2916666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018429910053984654, + "kl": 0.025264177471399307, + "learning_rate": 1.62270554186772e-07, + "loss": 0.0003, + "num_tokens": 59112665.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3424992561340332, + "sampling/importance_sampling_ratio/mean": 1.0000231266021729, + "sampling/importance_sampling_ratio/min": 0.6302787661552429, + "sampling/sampling_logp_difference/max": 0.4615931510925293, + "sampling/sampling_logp_difference/mean": 0.015209256671369076, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 189.34375, + "completions/mean_terminated_length": 189.34375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4359246492385864, + "epoch": 2.292892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042675306786993814, + "kl": 0.06339074671268463, + "learning_rate": 1.6174559034149737e-07, + "loss": 0.0007, + "num_tokens": 59143151.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4962064027786255, + "sampling/importance_sampling_ratio/mean": 1.0000028610229492, + "sampling/importance_sampling_ratio/min": 0.37432149052619934, + "sampling/sampling_logp_difference/max": 0.982640266418457, + "sampling/sampling_logp_difference/mean": 0.01693284697830677, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 200.421875, + "completions/mean_terminated_length": 200.421875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.38589876890182495, + "epoch": 2.2941176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9712494212671277, + "kl": 0.02685917168855667, + "learning_rate": 1.6122131313124538e-07, + "loss": 0.0914, + "num_tokens": 59174746.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5962004661560059, + "sampling/importance_sampling_ratio/mean": 0.99989253282547, + "sampling/importance_sampling_ratio/min": 0.6143203377723694, + "sampling/sampling_logp_difference/max": 0.4872387647628784, + "sampling/sampling_logp_difference/mean": 0.015417475253343582, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 209.390625, + "completions/mean_terminated_length": 209.390625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.5156528949737549, + "epoch": 2.295343137254902, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4502040434930394, + "kl": 0.14193961024284363, + "learning_rate": 1.606977236202654e-07, + "loss": -0.0113, + "num_tokens": 59204131.0, + "reward": -0.15625, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.3224138021469116, + "sampling/importance_sampling_ratio/mean": 1.0005409717559814, + "sampling/importance_sampling_ratio/min": 0.6412133574485779, + "sampling/sampling_logp_difference/max": 0.4443930387496948, + "sampling/sampling_logp_difference/mean": 0.016879171133041382, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 167.953125, + "completions/mean_terminated_length": 167.953125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.36485856771469116, + "epoch": 2.2965686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03095722801658572, + "kl": 0.07888477295637131, + "learning_rate": 1.6017482287141088e-07, + "loss": 0.0007, + "num_tokens": 59230272.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.311119556427002, + "sampling/importance_sampling_ratio/mean": 1.0003852844238281, + "sampling/importance_sampling_ratio/min": 0.47601020336151123, + "sampling/sampling_logp_difference/max": 0.7423160076141357, + "sampling/sampling_logp_difference/mean": 0.014451163820922375, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 187.546875, + "completions/mean_terminated_length": 187.546875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.43709540367126465, + "epoch": 2.297794117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019095308614842953, + "kl": 0.03428737074136734, + "learning_rate": 1.5965261194613755e-07, + "loss": 0.0003, + "num_tokens": 59257747.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.432391881942749, + "sampling/importance_sampling_ratio/mean": 1.000012993812561, + "sampling/importance_sampling_ratio/min": 0.613330066204071, + "sampling/sampling_logp_difference/max": 0.48885202407836914, + "sampling/sampling_logp_difference/mean": 0.017625847831368446, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 209.25, + "completions/mean_terminated_length": 209.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5272804498672485, + "epoch": 2.299019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9897238317528053, + "kl": 0.0817376896739006, + "learning_rate": 1.591310919045003e-07, + "loss": -0.0088, + "num_tokens": 59286787.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5805723667144775, + "sampling/importance_sampling_ratio/mean": 0.9994344115257263, + "sampling/importance_sampling_ratio/min": 0.6074384450912476, + "sampling/sampling_logp_difference/max": 0.4985044002532959, + "sampling/sampling_logp_difference/mean": 0.01894564740359783, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 216.296875, + "completions/mean_terminated_length": 216.296875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.39964842796325684, + "epoch": 2.3002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7348597119880874, + "kl": 0.03691429644823074, + "learning_rate": 1.5861026380515163e-07, + "loss": 0.0311, + "num_tokens": 59316134.0, + "reward": -0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5519212484359741, + "sampling/importance_sampling_ratio/mean": 1.000169038772583, + "sampling/importance_sampling_ratio/min": 0.6844035387039185, + "sampling/sampling_logp_difference/max": 0.43949365615844727, + "sampling/sampling_logp_difference/mean": 0.015072993002831936, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 257.84375, + "completions/mean_terminated_length": 257.84375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.40951257944107056, + "epoch": 2.301470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5938953544983316, + "kl": 0.03216740861535072, + "learning_rate": 1.5809012870533995e-07, + "loss": -0.0389, + "num_tokens": 59350492.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 1.0002248287200928, + "sampling/importance_sampling_ratio/min": 0.6401075720787048, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.014520933851599693, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 226.734375, + "completions/mean_terminated_length": 226.734375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.36163556575775146, + "epoch": 2.3026960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016005730270182657, + "kl": 0.029519587755203247, + "learning_rate": 1.575706876609063e-07, + "loss": 0.0003, + "num_tokens": 59387131.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6274335384368896, + "sampling/importance_sampling_ratio/mean": 1.0002219676971436, + "sampling/importance_sampling_ratio/min": 0.7300496697425842, + "sampling/sampling_logp_difference/max": 0.48700428009033203, + "sampling/sampling_logp_difference/mean": 0.01361087616533041, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 192.703125, + "completions/mean_terminated_length": 192.703125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4290919005870819, + "epoch": 2.303921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9125959034845414, + "kl": 0.028339726850390434, + "learning_rate": 1.5705194172628323e-07, + "loss": -0.0228, + "num_tokens": 59422232.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5048365592956543, + "sampling/importance_sampling_ratio/mean": 0.999923586845398, + "sampling/importance_sampling_ratio/min": 0.6614570617675781, + "sampling/sampling_logp_difference/max": 0.41331028938293457, + "sampling/sampling_logp_difference/mean": 0.014484135434031487, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 213.71875, + "completions/mean_terminated_length": 213.71875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4463856816291809, + "epoch": 2.3051470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8794570250768244, + "kl": 0.027920614928007126, + "learning_rate": 1.565338919544918e-07, + "loss": 0.0258, + "num_tokens": 59456966.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5813827514648438, + "sampling/importance_sampling_ratio/mean": 0.9998728036880493, + "sampling/importance_sampling_ratio/min": 0.6990068554878235, + "sampling/sampling_logp_difference/max": 0.4582996368408203, + "sampling/sampling_logp_difference/mean": 0.01653440296649933, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 226.8125, + "completions/mean_terminated_length": 226.8125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4854315221309662, + "epoch": 2.306372549019608, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1680551018341985, + "kl": 0.05970364809036255, + "learning_rate": 1.5601653939714072e-07, + "loss": 0.029, + "num_tokens": 59496746.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.620776653289795, + "sampling/importance_sampling_ratio/mean": 1.0002939701080322, + "sampling/importance_sampling_ratio/min": 0.4305850863456726, + "sampling/sampling_logp_difference/max": 0.8426103591918945, + "sampling/sampling_logp_difference/mean": 0.015956567600369453, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 205.40625, + "completions/mean_terminated_length": 205.40625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3487470746040344, + "epoch": 2.3075980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015999218989689683, + "kl": 0.022527508437633514, + "learning_rate": 1.5549988510442258e-07, + "loss": 0.0002, + "num_tokens": 59529108.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999996542930603, + "sampling/importance_sampling_ratio/min": 0.5221103429794312, + "sampling/sampling_logp_difference/max": 0.7264072895050049, + "sampling/sampling_logp_difference/mean": 0.014306320808827877, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 179.375, + "completions/mean_terminated_length": 179.375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3543795645236969, + "epoch": 2.3088235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017097044265547215, + "kl": 0.02797403559088707, + "learning_rate": 1.5498393012511285e-07, + "loss": 0.0003, + "num_tokens": 59557260.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.564490556716919, + "sampling/importance_sampling_ratio/mean": 1.0002918243408203, + "sampling/importance_sampling_ratio/min": 0.6820248961448669, + "sampling/sampling_logp_difference/max": 0.44756031036376953, + "sampling/sampling_logp_difference/mean": 0.014995114877820015, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 160.234375, + "completions/mean_terminated_length": 160.234375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.39390504360198975, + "epoch": 2.310049019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0198756834092795, + "kl": 0.06174597516655922, + "learning_rate": 1.5446867550656767e-07, + "loss": 0.0043, + "num_tokens": 59581579.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4194951057434082, + "sampling/importance_sampling_ratio/mean": 0.9993045330047607, + "sampling/importance_sampling_ratio/min": 0.692795991897583, + "sampling/sampling_logp_difference/max": 0.3670196533203125, + "sampling/sampling_logp_difference/mean": 0.015669317916035652, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 171.84375, + "completions/mean_terminated_length": 171.84375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3988805413246155, + "epoch": 2.311274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1363833495861986, + "kl": 0.03849609196186066, + "learning_rate": 1.5395412229472103e-07, + "loss": -0.0694, + "num_tokens": 59616561.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.382973551750183, + "sampling/importance_sampling_ratio/mean": 0.9999716281890869, + "sampling/importance_sampling_ratio/min": 0.64705890417099, + "sampling/sampling_logp_difference/max": 0.4353179931640625, + "sampling/sampling_logp_difference/mean": 0.01593279466032982, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 172.015625, + "completions/mean_terminated_length": 172.015625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3996015191078186, + "epoch": 2.3125, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1523603472810369, + "kl": 0.039179421961307526, + "learning_rate": 1.5344027153408374e-07, + "loss": 0.0099, + "num_tokens": 59655074.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.3922832012176514, + "sampling/importance_sampling_ratio/mean": 0.9999604225158691, + "sampling/importance_sampling_ratio/min": 0.6298457980155945, + "sampling/sampling_logp_difference/max": 0.4622802734375, + "sampling/sampling_logp_difference/mean": 0.015173811465501785, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 196.21875, + "completions/mean_terminated_length": 196.21875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.32767951488494873, + "epoch": 2.313725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9607446647164233, + "kl": 0.024808941408991814, + "learning_rate": 1.5292712426773973e-07, + "loss": 0.0369, + "num_tokens": 59683280.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5017434358596802, + "sampling/importance_sampling_ratio/mean": 0.9995311498641968, + "sampling/importance_sampling_ratio/min": 0.7266528606414795, + "sampling/sampling_logp_difference/max": 0.40662670135498047, + "sampling/sampling_logp_difference/mean": 0.01315168384462595, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 193.484375, + "completions/mean_terminated_length": 193.484375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.30524688959121704, + "epoch": 2.314950980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015802220938073968, + "kl": 0.022244073450565338, + "learning_rate": 1.5241468153734594e-07, + "loss": 0.0002, + "num_tokens": 59721327.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071572065353394, + "sampling/importance_sampling_ratio/mean": 0.9994029402732849, + "sampling/importance_sampling_ratio/min": 0.6557306051254272, + "sampling/sampling_logp_difference/max": 0.42200517654418945, + "sampling/sampling_logp_difference/mean": 0.012906955555081367, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 158.75, + "completions/mean_terminated_length": 158.75, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3782504200935364, + "epoch": 2.3161764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.847135206429144, + "kl": 0.06033479794859886, + "learning_rate": 1.5190294438312834e-07, + "loss": -0.0002, + "num_tokens": 59746959.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.300763487815857, + "sampling/importance_sampling_ratio/mean": 0.9997039437294006, + "sampling/importance_sampling_ratio/min": 0.62606281042099, + "sampling/sampling_logp_difference/max": 0.4683046340942383, + "sampling/sampling_logp_difference/mean": 0.015130390413105488, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 154.640625, + "completions/mean_terminated_length": 154.640625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.37160420417785645, + "epoch": 2.3174019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0895433055967554, + "kl": 0.0382518470287323, + "learning_rate": 1.5139191384388094e-07, + "loss": 0.003, + "num_tokens": 59771416.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6412906646728516, + "sampling/importance_sampling_ratio/mean": 0.999275267124176, + "sampling/importance_sampling_ratio/min": 0.662240207195282, + "sampling/sampling_logp_difference/max": 0.4954829216003418, + "sampling/sampling_logp_difference/mean": 0.01576092839241028, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 257.046875, + "completions/mean_terminated_length": 257.046875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.253875195980072, + "epoch": 2.318627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011137341327517333, + "kl": 0.015241425484418869, + "learning_rate": 1.5088159095696362e-07, + "loss": 0.0001, + "num_tokens": 59802907.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6928167343139648, + "sampling/importance_sampling_ratio/mean": 1.0005075931549072, + "sampling/importance_sampling_ratio/min": 0.6532841324806213, + "sampling/sampling_logp_difference/max": 0.5263938903808594, + "sampling/sampling_logp_difference/mean": 0.010319976136088371, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 216.09375, + "completions/mean_terminated_length": 216.09375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.2632845640182495, + "epoch": 2.3198529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01943818221351062, + "kl": 0.024373536929488182, + "learning_rate": 1.5037197675829916e-07, + "loss": 0.0002, + "num_tokens": 59837681.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.0005693435668945, + "sampling/importance_sampling_ratio/min": 0.6622360348701477, + "sampling/sampling_logp_difference/max": 0.4361441135406494, + "sampling/sampling_logp_difference/mean": 0.011405350640416145, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 221.046875, + "completions/mean_terminated_length": 221.046875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3872997760772705, + "epoch": 2.321078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014508614460023527, + "kl": 0.023506823927164078, + "learning_rate": 1.4986307228237267e-07, + "loss": 0.0002, + "num_tokens": 59875668.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4615130424499512, + "sampling/importance_sampling_ratio/mean": 1.0005359649658203, + "sampling/importance_sampling_ratio/min": 0.6202294230461121, + "sampling/sampling_logp_difference/max": 0.47766590118408203, + "sampling/sampling_logp_difference/mean": 0.014735687524080276, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 195.734375, + "completions/mean_terminated_length": 195.734375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.48053818941116333, + "epoch": 2.3223039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8543515042561025, + "kl": 0.047940317541360855, + "learning_rate": 1.4935487856222723e-07, + "loss": -0.0152, + "num_tokens": 59908611.0, + "reward": -0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": -0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.3627339601516724, + "sampling/importance_sampling_ratio/mean": 1.0002124309539795, + "sampling/importance_sampling_ratio/min": 0.7186917662620544, + "sampling/sampling_logp_difference/max": 0.3303227424621582, + "sampling/sampling_logp_difference/mean": 0.01626776158809662, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 218.859375, + "completions/mean_terminated_length": 218.859375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3811754584312439, + "epoch": 2.323529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016649727675794618, + "kl": 0.028688944876194, + "learning_rate": 1.4884739662946445e-07, + "loss": 0.0003, + "num_tokens": 59943594.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 0.999620795249939, + "sampling/importance_sampling_ratio/min": 0.5384246110916138, + "sampling/sampling_logp_difference/max": 0.6191077828407288, + "sampling/sampling_logp_difference/mean": 0.0145262461155653, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 196.234375, + "completions/mean_terminated_length": 196.234375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3703479766845703, + "epoch": 2.3247549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015270272827534641, + "kl": 0.02394244819879532, + "learning_rate": 1.4834062751424015e-07, + "loss": 0.0002, + "num_tokens": 59976473.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6088929176330566, + "sampling/importance_sampling_ratio/mean": 0.9999560713768005, + "sampling/importance_sampling_ratio/min": 0.5807967185974121, + "sampling/sampling_logp_difference/max": 0.5433545112609863, + "sampling/sampling_logp_difference/mean": 0.01434406265616417, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 204.1875, + "completions/mean_terminated_length": 204.1875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3638109564781189, + "epoch": 2.325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013468778188168539, + "kl": 0.02090836875140667, + "learning_rate": 1.478345722452639e-07, + "loss": 0.0002, + "num_tokens": 60005701.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5052893161773682, + "sampling/importance_sampling_ratio/mean": 0.9993464946746826, + "sampling/importance_sampling_ratio/min": 0.5578929781913757, + "sampling/sampling_logp_difference/max": 0.5835881233215332, + "sampling/sampling_logp_difference/mean": 0.014573956839740276, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 162.984375, + "completions/mean_terminated_length": 162.984375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3963279724121094, + "epoch": 2.327205882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01904312950438511, + "kl": 0.03473995253443718, + "learning_rate": 1.4732923184979562e-07, + "loss": 0.0003, + "num_tokens": 60033428.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.410925269126892, + "sampling/importance_sampling_ratio/mean": 0.9995149970054626, + "sampling/importance_sampling_ratio/min": 0.6589466333389282, + "sampling/sampling_logp_difference/max": 0.4171128273010254, + "sampling/sampling_logp_difference/mean": 0.015039639547467232, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 197.515625, + "completions/mean_terminated_length": 197.515625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.39951860904693604, + "epoch": 2.3284313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8576159884898699, + "kl": 0.0305451862514019, + "learning_rate": 1.4682460735364422e-07, + "loss": 0.0163, + "num_tokens": 60062565.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.3865586519241333, + "sampling/importance_sampling_ratio/mean": 1.0001012086868286, + "sampling/importance_sampling_ratio/min": 0.6482308506965637, + "sampling/sampling_logp_difference/max": 0.43350839614868164, + "sampling/sampling_logp_difference/mean": 0.014886585995554924, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.2817862927913666, + "epoch": 2.329656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015977006957823323, + "kl": 0.01980246603488922, + "learning_rate": 1.4632069978116584e-07, + "loss": 0.0002, + "num_tokens": 60092269.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5823426246643066, + "sampling/importance_sampling_ratio/mean": 1.0000944137573242, + "sampling/importance_sampling_ratio/min": 0.606020987033844, + "sampling/sampling_logp_difference/max": 0.5008406639099121, + "sampling/sampling_logp_difference/mean": 0.012155575677752495, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 180.203125, + "completions/mean_terminated_length": 180.203125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3786773979663849, + "epoch": 2.3308823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017527550338411666, + "kl": 0.02328067645430565, + "learning_rate": 1.4581751015526033e-07, + "loss": 0.0002, + "num_tokens": 60120810.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6157784461975098, + "sampling/importance_sampling_ratio/mean": 1.0003771781921387, + "sampling/importance_sampling_ratio/min": 0.6090195775032043, + "sampling/sampling_logp_difference/max": 0.49590492248535156, + "sampling/sampling_logp_difference/mean": 0.015042027458548546, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 179.1875, + "completions/mean_terminated_length": 179.1875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4272390604019165, + "epoch": 2.332107843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0790700487473894, + "kl": 0.03297770768404007, + "learning_rate": 1.4531503949737106e-07, + "loss": 0.0314, + "num_tokens": 60152566.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5092428922653198, + "sampling/importance_sampling_ratio/mean": 1.0000388622283936, + "sampling/importance_sampling_ratio/min": 0.6622360348701477, + "sampling/sampling_logp_difference/max": 0.41213321685791016, + "sampling/sampling_logp_difference/mean": 0.016097504645586014, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 203.625, + "completions/mean_terminated_length": 203.625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.28215962648391724, + "epoch": 2.3333333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019570314729581942, + "kl": 0.022349946200847626, + "learning_rate": 1.4481328882748184e-07, + "loss": 0.0002, + "num_tokens": 60180814.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3916409015655518, + "sampling/importance_sampling_ratio/mean": 0.9997809529304504, + "sampling/importance_sampling_ratio/min": 0.4278160035610199, + "sampling/sampling_logp_difference/max": 0.8490620851516724, + "sampling/sampling_logp_difference/mean": 0.011633295565843582, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 186.203125, + "completions/mean_terminated_length": 186.203125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.37325119972229004, + "epoch": 2.3345588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9482082358343803, + "kl": 0.03111817128956318, + "learning_rate": 1.4431225916411455e-07, + "loss": -0.0226, + "num_tokens": 60207579.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.345187783241272, + "sampling/importance_sampling_ratio/mean": 1.000281572341919, + "sampling/importance_sampling_ratio/min": 0.6062778234481812, + "sampling/sampling_logp_difference/max": 0.5004169940948486, + "sampling/sampling_logp_difference/mean": 0.014844301156699657, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 198.203125, + "completions/mean_terminated_length": 198.203125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.45078957080841064, + "epoch": 2.3357843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8205068788653546, + "kl": 0.057336561381816864, + "learning_rate": 1.4381195152432769e-07, + "loss": -0.0238, + "num_tokens": 60242392.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5392322540283203, + "sampling/importance_sampling_ratio/mean": 0.9996870756149292, + "sampling/importance_sampling_ratio/min": 0.5160008668899536, + "sampling/sampling_logp_difference/max": 0.661646842956543, + "sampling/sampling_logp_difference/mean": 0.017214279621839523, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 148.90625, + "completions/mean_terminated_length": 148.90625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.34201350808143616, + "epoch": 2.3370098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05245526815709473, + "kl": 0.0537988543510437, + "learning_rate": 1.4331236692371384e-07, + "loss": 0.0005, + "num_tokens": 60266034.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.602060079574585, + "sampling/importance_sampling_ratio/mean": 0.9996627569198608, + "sampling/importance_sampling_ratio/min": 0.699406087398529, + "sampling/sampling_logp_difference/max": 0.47129034996032715, + "sampling/sampling_logp_difference/mean": 0.01396908238530159, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 176.359375, + "completions/mean_terminated_length": 176.359375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.41292208433151245, + "epoch": 2.338235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1055960854986142, + "kl": 0.033058423548936844, + "learning_rate": 1.428135063763985e-07, + "loss": -0.042, + "num_tokens": 60298713.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6507264375686646, + "sampling/importance_sampling_ratio/mean": 1.0003893375396729, + "sampling/importance_sampling_ratio/min": 0.6855221390724182, + "sampling/sampling_logp_difference/max": 0.5012154579162598, + "sampling/sampling_logp_difference/mean": 0.01686900667846203, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 193.0625, + "completions/mean_terminated_length": 193.0625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.374266654253006, + "epoch": 2.3394607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02167045988977778, + "kl": 0.04262005537748337, + "learning_rate": 1.4231537089503675e-07, + "loss": 0.0004, + "num_tokens": 60329613.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.539963722229004, + "sampling/importance_sampling_ratio/mean": 1.000189185142517, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.015530496835708618, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 260.796875, + "completions/mean_terminated_length": 260.796875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.4065209925174713, + "epoch": 2.340686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7441707090446501, + "kl": 0.024460311979055405, + "learning_rate": 1.4181796149081194e-07, + "loss": -0.0102, + "num_tokens": 60367152.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4363536834716797, + "sampling/importance_sampling_ratio/mean": 0.999853789806366, + "sampling/importance_sampling_ratio/min": 0.4462917745113373, + "sampling/sampling_logp_difference/max": 0.8067823648452759, + "sampling/sampling_logp_difference/mean": 0.013409093022346497, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 178.828125, + "completions/mean_terminated_length": 178.828125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.37713760137557983, + "epoch": 2.3419117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9662019583827675, + "kl": 0.05671603977680206, + "learning_rate": 1.4132127917343394e-07, + "loss": -0.0134, + "num_tokens": 60394981.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.566908359527588, + "sampling/importance_sampling_ratio/mean": 0.9996801614761353, + "sampling/importance_sampling_ratio/min": 0.702597439289093, + "sampling/sampling_logp_difference/max": 0.44910454750061035, + "sampling/sampling_logp_difference/mean": 0.014646901749074459, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 181.28125, + "completions/mean_terminated_length": 181.28125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.44547849893569946, + "epoch": 2.343137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0625432473513456, + "kl": 0.032026518136262894, + "learning_rate": 1.4082532495113624e-07, + "loss": -0.005, + "num_tokens": 60423527.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.487159252166748, + "sampling/importance_sampling_ratio/mean": 1.0001486539840698, + "sampling/importance_sampling_ratio/min": 0.6141946315765381, + "sampling/sampling_logp_difference/max": 0.4874434471130371, + "sampling/sampling_logp_difference/mean": 0.015257124789059162, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 218.34375, + "completions/mean_terminated_length": 218.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.26839399337768555, + "epoch": 2.344362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9766257339179132, + "kl": 0.028687234967947006, + "learning_rate": 1.4033009983067452e-07, + "loss": 0.0311, + "num_tokens": 60457021.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.407607913017273, + "sampling/importance_sampling_ratio/mean": 0.9995027780532837, + "sampling/importance_sampling_ratio/min": 0.5688512921333313, + "sampling/sampling_logp_difference/max": 0.564136266708374, + "sampling/sampling_logp_difference/mean": 0.010518480092287064, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 191.34375, + "completions/mean_terminated_length": 191.34375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.42183470726013184, + "epoch": 2.3455882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0375708725802242, + "kl": 0.033426955342292786, + "learning_rate": 1.398356048173242e-07, + "loss": -0.0042, + "num_tokens": 60489475.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.627564787864685, + "sampling/importance_sampling_ratio/mean": 1.0002881288528442, + "sampling/importance_sampling_ratio/min": 0.6455309391021729, + "sampling/sampling_logp_difference/max": 0.48708486557006836, + "sampling/sampling_logp_difference/mean": 0.01605754718184471, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 221.796875, + "completions/mean_terminated_length": 221.796875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35900455713272095, + "epoch": 2.346813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7729176553637848, + "kl": 0.026185456663370132, + "learning_rate": 1.3934184091487915e-07, + "loss": 0.0099, + "num_tokens": 60518086.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.8179771900177002, + "sampling/importance_sampling_ratio/mean": 1.0001744031906128, + "sampling/importance_sampling_ratio/min": 0.706462562084198, + "sampling/sampling_logp_difference/max": 0.597724437713623, + "sampling/sampling_logp_difference/mean": 0.013698907569050789, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 160.96875, + "completions/mean_terminated_length": 160.96875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.29528480768203735, + "epoch": 2.3480392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01652170370905375, + "kl": 0.022979356348514557, + "learning_rate": 1.3884880912564873e-07, + "loss": 0.0002, + "num_tokens": 60544388.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6976975202560425, + "sampling/importance_sampling_ratio/mean": 1.0002460479736328, + "sampling/importance_sampling_ratio/min": 0.5719316601753235, + "sampling/sampling_logp_difference/max": 0.558735728263855, + "sampling/sampling_logp_difference/mean": 0.012866152450442314, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 186.9375, + "completions/mean_terminated_length": 186.9375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.31954970955848694, + "epoch": 2.349264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018463198147938467, + "kl": 0.025077354162931442, + "learning_rate": 1.3835651045045598e-07, + "loss": 0.0002, + "num_tokens": 60570464.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6093238592147827, + "sampling/importance_sampling_ratio/mean": 1.0000442266464233, + "sampling/importance_sampling_ratio/min": 0.6098756790161133, + "sampling/sampling_logp_difference/max": 0.49450016021728516, + "sampling/sampling_logp_difference/mean": 0.014753278344869614, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 204.015625, + "completions/mean_terminated_length": 204.015625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3832298517227173, + "epoch": 2.3504901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0315911254897412, + "kl": 0.02888948656618595, + "learning_rate": 1.3786494588863633e-07, + "loss": 0.026, + "num_tokens": 60610497.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.417617678642273, + "sampling/importance_sampling_ratio/mean": 1.0000133514404297, + "sampling/importance_sampling_ratio/min": 0.7053484320640564, + "sampling/sampling_logp_difference/max": 0.3490633964538574, + "sampling/sampling_logp_difference/mean": 0.013524937443435192, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 209.84375, + "completions/mean_terminated_length": 209.84375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3601645231246948, + "epoch": 2.3517156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013180003386095242, + "kl": 0.021510332822799683, + "learning_rate": 1.3737411643803448e-07, + "loss": 0.0002, + "num_tokens": 60641943.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4875458478927612, + "sampling/importance_sampling_ratio/mean": 0.999901294708252, + "sampling/importance_sampling_ratio/min": 0.6780306696891785, + "sampling/sampling_logp_difference/max": 0.397127628326416, + "sampling/sampling_logp_difference/mean": 0.013994856737554073, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 278.625, + "completions/mean_terminated_length": 278.625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4034101068973541, + "epoch": 2.3529411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3034565320338045, + "kl": 0.022126968950033188, + "learning_rate": 1.368840230950035e-07, + "loss": 0.0277, + "num_tokens": 60679679.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6274250745773315, + "sampling/importance_sampling_ratio/mean": 0.9997659921646118, + "sampling/importance_sampling_ratio/min": 0.620280385017395, + "sampling/sampling_logp_difference/max": 0.4869990348815918, + "sampling/sampling_logp_difference/mean": 0.014682994224131107, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 202.46875, + "completions/mean_terminated_length": 202.46875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3586995303630829, + "epoch": 2.3541666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013093767728225337, + "kl": 0.02074764110147953, + "learning_rate": 1.3639466685440132e-07, + "loss": 0.0002, + "num_tokens": 60710285.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9231685400009155, + "sampling/importance_sampling_ratio/mean": 1.0003397464752197, + "sampling/importance_sampling_ratio/min": 0.6959205269813538, + "sampling/sampling_logp_difference/max": 0.6539740562438965, + "sampling/sampling_logp_difference/mean": 0.014015990309417248, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 149.0, + "completions/mean_terminated_length": 149.0, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35728025436401367, + "epoch": 2.355392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02601540431378326, + "kl": 0.029771855100989342, + "learning_rate": 1.3590604870959043e-07, + "loss": 0.0003, + "num_tokens": 60735917.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5385515689849854, + "sampling/importance_sampling_ratio/mean": 0.9996775388717651, + "sampling/importance_sampling_ratio/min": 0.6397542357444763, + "sampling/sampling_logp_difference/max": 0.4466712474822998, + "sampling/sampling_logp_difference/mean": 0.014768035151064396, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 189.765625, + "completions/mean_terminated_length": 189.765625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3744673728942871, + "epoch": 2.3566176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0277436056638698, + "kl": 0.02428172156214714, + "learning_rate": 1.3541816965243462e-07, + "loss": 0.0418, + "num_tokens": 60769806.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6652601957321167, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.618177056312561, + "sampling/sampling_logp_difference/max": 0.5099813938140869, + "sampling/sampling_logp_difference/mean": 0.013300665654242039, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 175.6875, + "completions/mean_terminated_length": 175.6875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.348365843296051, + "epoch": 2.357843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024166011267922807, + "kl": 0.026825370267033577, + "learning_rate": 1.3493103067329737e-07, + "loss": 0.0003, + "num_tokens": 60799354.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5479344129562378, + "sampling/importance_sampling_ratio/mean": 0.9999157190322876, + "sampling/importance_sampling_ratio/min": 0.6347072124481201, + "sampling/sampling_logp_difference/max": 0.4545915126800537, + "sampling/sampling_logp_difference/mean": 0.01401049830019474, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 218.171875, + "completions/mean_terminated_length": 218.171875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.3539429306983948, + "epoch": 2.3590686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.747960810606086, + "kl": 0.03561093658208847, + "learning_rate": 1.3444463276104012e-07, + "loss": -0.0106, + "num_tokens": 60832293.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4050774574279785, + "sampling/importance_sampling_ratio/mean": 0.9996697902679443, + "sampling/importance_sampling_ratio/min": 0.6177712678909302, + "sampling/sampling_logp_difference/max": 0.48163700103759766, + "sampling/sampling_logp_difference/mean": 0.014360794797539711, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 216.046875, + "completions/mean_terminated_length": 216.046875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3854154944419861, + "epoch": 2.360294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9748618910236813, + "kl": 0.024287715554237366, + "learning_rate": 1.3395897690301966e-07, + "loss": 0.0918, + "num_tokens": 60866616.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9118987321853638, + "sampling/importance_sampling_ratio/mean": 1.0000795125961304, + "sampling/importance_sampling_ratio/min": 0.6256084442138672, + "sampling/sampling_logp_difference/max": 0.6480967998504639, + "sampling/sampling_logp_difference/mean": 0.013329725712537766, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 231.546875, + "completions/mean_terminated_length": 231.546875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.3768054246902466, + "epoch": 2.361519607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013843168965082383, + "kl": 0.019867606461048126, + "learning_rate": 1.3347406408508694e-07, + "loss": 0.0002, + "num_tokens": 60897451.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.424190878868103, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 0.684594452381134, + "sampling/sampling_logp_difference/max": 0.37892866134643555, + "sampling/sampling_logp_difference/mean": 0.014006221666932106, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 226.34375, + "completions/mean_terminated_length": 226.34375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.40192288160324097, + "epoch": 2.3627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7252619919133455, + "kl": 0.0270424522459507, + "learning_rate": 1.3298989529158378e-07, + "loss": 0.0071, + "num_tokens": 60932769.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6022827625274658, + "sampling/importance_sampling_ratio/mean": 1.000500202178955, + "sampling/importance_sampling_ratio/min": 0.6042662858963013, + "sampling/sampling_logp_difference/max": 0.5037403106689453, + "sampling/sampling_logp_difference/mean": 0.014340700581669807, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 160.171875, + "completions/mean_terminated_length": 160.171875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.29812660813331604, + "epoch": 2.363970588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9414259803642635, + "kl": 0.03467084467411041, + "learning_rate": 1.325064715053425e-07, + "loss": -0.0398, + "num_tokens": 60957116.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6463149785995483, + "sampling/importance_sampling_ratio/mean": 1.0005197525024414, + "sampling/importance_sampling_ratio/min": 0.6299277544021606, + "sampling/sampling_logp_difference/max": 0.49853944778442383, + "sampling/sampling_logp_difference/mean": 0.013119611889123917, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 223.46875, + "completions/mean_terminated_length": 223.46875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.47285810112953186, + "epoch": 2.3651960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8889440291759665, + "kl": 0.036731865257024765, + "learning_rate": 1.320237937076825e-07, + "loss": -0.0301, + "num_tokens": 60989866.0, + "reward": -0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.3725612163543701, + "sampling/importance_sampling_ratio/mean": 1.0006674528121948, + "sampling/importance_sampling_ratio/min": 0.5886189937591553, + "sampling/sampling_logp_difference/max": 0.5299761891365051, + "sampling/sampling_logp_difference/mean": 0.01706557720899582, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 190.1875, + "completions/mean_terminated_length": 190.1875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.38296884298324585, + "epoch": 2.366421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020030366923427478, + "kl": 0.03225460276007652, + "learning_rate": 1.3154186287840946e-07, + "loss": 0.0003, + "num_tokens": 61021174.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6649678945541382, + "sampling/importance_sampling_ratio/mean": 0.9999372959136963, + "sampling/importance_sampling_ratio/min": 0.5696814060211182, + "sampling/sampling_logp_difference/max": 0.5626779794692993, + "sampling/sampling_logp_difference/mean": 0.016994740813970566, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 185.484375, + "completions/mean_terminated_length": 185.484375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.44133448600769043, + "epoch": 2.3676470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6970894685886355, + "kl": 0.07990291714668274, + "learning_rate": 1.310606799958122e-07, + "loss": -0.0044, + "num_tokens": 61054853.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3333005905151367, + "sampling/importance_sampling_ratio/mean": 1.0000016689300537, + "sampling/importance_sampling_ratio/min": 0.6799634695053101, + "sampling/sampling_logp_difference/max": 0.38571619987487793, + "sampling/sampling_logp_difference/mean": 0.01756683550775051, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 230.75, + "completions/mean_terminated_length": 230.75, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.40245676040649414, + "epoch": 2.368872549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0135021678291646, + "kl": 0.024305783212184906, + "learning_rate": 1.305802460366615e-07, + "loss": 0.0002, + "num_tokens": 61094357.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5206624269485474, + "sampling/importance_sampling_ratio/mean": 0.9997435808181763, + "sampling/importance_sampling_ratio/min": 0.6622399091720581, + "sampling/sampling_logp_difference/max": 0.4191460609436035, + "sampling/sampling_logp_difference/mean": 0.015357659198343754, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 175.859375, + "completions/mean_terminated_length": 175.859375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3690338730812073, + "epoch": 2.3700980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.204549270967772, + "kl": 0.026068702340126038, + "learning_rate": 1.3010056197620812e-07, + "loss": 0.0464, + "num_tokens": 61125388.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5744441747665405, + "sampling/importance_sampling_ratio/mean": 1.0006462335586548, + "sampling/importance_sampling_ratio/min": 0.5334433913230896, + "sampling/sampling_logp_difference/max": 0.6284023523330688, + "sampling/sampling_logp_difference/mean": 0.015417314134538174, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 221.875, + "completions/mean_terminated_length": 221.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4958120584487915, + "epoch": 2.3713235294117645, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2295254402215419, + "kl": 0.05303645133972168, + "learning_rate": 1.2962162878817985e-07, + "loss": -0.014, + "num_tokens": 61161524.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.396849274635315, + "sampling/importance_sampling_ratio/mean": 1.000116229057312, + "sampling/importance_sampling_ratio/min": 0.7317201495170593, + "sampling/sampling_logp_difference/max": 0.33421921730041504, + "sampling/sampling_logp_difference/mean": 0.016872648149728775, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 221.75, + "completions/mean_terminated_length": 221.75, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.37191271781921387, + "epoch": 2.372549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015527941135002886, + "kl": 0.02844407968223095, + "learning_rate": 1.2914344744478112e-07, + "loss": 0.0003, + "num_tokens": 61194756.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6271814107894897, + "sampling/importance_sampling_ratio/mean": 0.9991934895515442, + "sampling/importance_sampling_ratio/min": 0.6379886865615845, + "sampling/sampling_logp_difference/max": 0.486849308013916, + "sampling/sampling_logp_difference/mean": 0.01516721025109291, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 188.734375, + "completions/mean_terminated_length": 188.734375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.346946656703949, + "epoch": 2.373774509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9941853943369408, + "kl": 0.03440224006772041, + "learning_rate": 1.2866601891668942e-07, + "loss": 0.0366, + "num_tokens": 61223107.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.666730284690857, + "sampling/importance_sampling_ratio/mean": 1.0004414319992065, + "sampling/importance_sampling_ratio/min": 0.5941352844238281, + "sampling/sampling_logp_difference/max": 0.5206482410430908, + "sampling/sampling_logp_difference/mean": 0.01369963027536869, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 160.625, + "completions/mean_terminated_length": 160.625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3266833424568176, + "epoch": 2.375, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02055707743919508, + "kl": 0.023911627009510994, + "learning_rate": 1.2818934417305477e-07, + "loss": 0.0002, + "num_tokens": 61250619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3993620872497559, + "sampling/importance_sampling_ratio/mean": 0.9997545480728149, + "sampling/importance_sampling_ratio/min": 0.17980583012104034, + "sampling/sampling_logp_difference/max": 1.7158777713775635, + "sampling/sampling_logp_difference/mean": 0.014155317097902298, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 235.515625, + "completions/mean_terminated_length": 235.515625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.36854732036590576, + "epoch": 2.376225490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020321860124814524, + "kl": 0.030030731111764908, + "learning_rate": 1.2771342418149656e-07, + "loss": 0.0003, + "num_tokens": 61286748.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.633960485458374, + "sampling/importance_sampling_ratio/mean": 1.0005673170089722, + "sampling/importance_sampling_ratio/min": 0.6923255324363708, + "sampling/sampling_logp_difference/max": 0.49100685119628906, + "sampling/sampling_logp_difference/mean": 0.015137957409024239, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 227.84375, + "completions/mean_terminated_length": 227.84375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.5019451379776001, + "epoch": 2.377450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7251780945530559, + "kl": 0.06397197395563126, + "learning_rate": 1.2723825990810204e-07, + "loss": 0.0009, + "num_tokens": 61320034.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.642146348953247, + "sampling/importance_sampling_ratio/mean": 1.0002970695495605, + "sampling/importance_sampling_ratio/min": 0.6581567525863647, + "sampling/sampling_logp_difference/max": 0.4960041046142578, + "sampling/sampling_logp_difference/mean": 0.016976214945316315, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 176.171875, + "completions/mean_terminated_length": 176.171875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.273092657327652, + "epoch": 2.3786764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01880395412207116, + "kl": 0.021814711391925812, + "learning_rate": 1.2676385231742494e-07, + "loss": 0.0002, + "num_tokens": 61347853.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5767922401428223, + "sampling/importance_sampling_ratio/mean": 1.0000545978546143, + "sampling/importance_sampling_ratio/min": 0.5499488711357117, + "sampling/sampling_logp_difference/max": 0.5979299545288086, + "sampling/sampling_logp_difference/mean": 0.012390416115522385, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 213.078125, + "completions/mean_terminated_length": 213.078125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.31701821088790894, + "epoch": 2.3799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8289956829635279, + "kl": 0.019230041652917862, + "learning_rate": 1.262902023724824e-07, + "loss": 0.0692, + "num_tokens": 61378162.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5289256572723389, + "sampling/importance_sampling_ratio/mean": 1.000627040863037, + "sampling/importance_sampling_ratio/min": 0.6622400879859924, + "sampling/sampling_logp_difference/max": 0.42456531524658203, + "sampling/sampling_logp_difference/mean": 0.011816595681011677, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 217.6875, + "completions/mean_terminated_length": 217.6875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4216040372848511, + "epoch": 2.381127450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7773989763178522, + "kl": 0.03368686884641647, + "learning_rate": 1.258173110347538e-07, + "loss": -0.0106, + "num_tokens": 61421262.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5190861225128174, + "sampling/importance_sampling_ratio/mean": 1.000107765197754, + "sampling/importance_sampling_ratio/min": 0.5889346599578857, + "sampling/sampling_logp_difference/max": 0.5294401049613953, + "sampling/sampling_logp_difference/mean": 0.014785869047045708, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 242.015625, + "completions/mean_terminated_length": 242.015625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.34563833475112915, + "epoch": 2.3823529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9038419975369213, + "kl": 0.022247061133384705, + "learning_rate": 1.253451792641785e-07, + "loss": -0.0065, + "num_tokens": 61456047.0, + "reward": 0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000001192092896, + "sampling/importance_sampling_ratio/min": 0.6332416534423828, + "sampling/sampling_logp_difference/max": 0.8587944507598877, + "sampling/sampling_logp_difference/mean": 0.012755580246448517, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 177.453125, + "completions/mean_terminated_length": 177.453125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.443953275680542, + "epoch": 2.383578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3382090616515316, + "kl": 0.042421214282512665, + "learning_rate": 1.248738080191543e-07, + "loss": 0.0166, + "num_tokens": 61482300.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.631811499595642, + "sampling/importance_sampling_ratio/mean": 0.9994724988937378, + "sampling/importance_sampling_ratio/min": 0.13601350784301758, + "sampling/sampling_logp_difference/max": 1.9950010776519775, + "sampling/sampling_logp_difference/mean": 0.01710696518421173, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 172.765625, + "completions/mean_terminated_length": 172.765625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.35881149768829346, + "epoch": 2.3848039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02606381030151454, + "kl": 0.03052517957985401, + "learning_rate": 1.244031982565349e-07, + "loss": 0.0003, + "num_tokens": 61507949.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4016011953353882, + "sampling/importance_sampling_ratio/mean": 0.9998782873153687, + "sampling/importance_sampling_ratio/min": 0.6277007460594177, + "sampling/sampling_logp_difference/max": 0.46569180488586426, + "sampling/sampling_logp_difference/mean": 0.01494034007191658, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 213.28125, + "completions/mean_terminated_length": 213.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.529669463634491, + "epoch": 2.386029411764706, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5369391712282672, + "kl": 0.10564006865024567, + "learning_rate": 1.239333509316281e-07, + "loss": -0.0258, + "num_tokens": 61542255.0, + "reward": 0.40625, + "reward_std": 0.6205305457115173, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6018822193145752, + "sampling/importance_sampling_ratio/mean": 1.0002837181091309, + "sampling/importance_sampling_ratio/min": 0.655114471912384, + "sampling/sampling_logp_difference/max": 0.4711792469024658, + "sampling/sampling_logp_difference/mean": 0.018138162791728973, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 183.921875, + "completions/mean_terminated_length": 183.921875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3101119101047516, + "epoch": 2.3872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014355034561366454, + "kl": 0.023137640208005905, + "learning_rate": 1.2346426699819456e-07, + "loss": 0.0002, + "num_tokens": 61572538.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5019768476486206, + "sampling/importance_sampling_ratio/mean": 0.9999720454216003, + "sampling/importance_sampling_ratio/min": 0.6942974328994751, + "sampling/sampling_logp_difference/max": 0.4067821502685547, + "sampling/sampling_logp_difference/mean": 0.01327432505786419, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 181.4375, + "completions/mean_terminated_length": 181.4375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.3148180842399597, + "epoch": 2.388480392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01795075784860789, + "kl": 0.024346178397536278, + "learning_rate": 1.2299594740844476e-07, + "loss": 0.0002, + "num_tokens": 61601030.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5028719902038574, + "sampling/importance_sampling_ratio/mean": 0.9996898174285889, + "sampling/importance_sampling_ratio/min": 0.6907265186309814, + "sampling/sampling_logp_difference/max": 0.4073779582977295, + "sampling/sampling_logp_difference/mean": 0.013319061137735844, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 152.546875, + "completions/mean_terminated_length": 152.546875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3306979238986969, + "epoch": 2.389705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016788307033416612, + "kl": 0.02332313545048237, + "learning_rate": 1.225283931130378e-07, + "loss": 0.0002, + "num_tokens": 61625673.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5292854309082031, + "sampling/importance_sampling_ratio/mean": 1.0003693103790283, + "sampling/importance_sampling_ratio/min": 0.6097698211669922, + "sampling/sampling_logp_difference/max": 0.4946737289428711, + "sampling/sampling_logp_difference/mean": 0.01440692599862814, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 156.0, + "completions/mean_terminated_length": 156.0, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.37355828285217285, + "epoch": 2.3909313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6250700743102716, + "kl": 0.04412819445133209, + "learning_rate": 1.220616050610791e-07, + "loss": 0.014, + "num_tokens": 61652585.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3118984699249268, + "sampling/importance_sampling_ratio/mean": 0.9995921850204468, + "sampling/importance_sampling_ratio/min": 0.6029638648033142, + "sampling/sampling_logp_difference/max": 0.5058979988098145, + "sampling/sampling_logp_difference/mean": 0.015270160511136055, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 198.828125, + "completions/mean_terminated_length": 198.828125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.35545986890792847, + "epoch": 2.392156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8277295072089057, + "kl": 0.0282914862036705, + "learning_rate": 1.2159558420011905e-07, + "loss": 0.0111, + "num_tokens": 61683470.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.622059941291809, + "sampling/importance_sampling_ratio/mean": 1.000211477279663, + "sampling/importance_sampling_ratio/min": 0.6415322422981262, + "sampling/sampling_logp_difference/max": 0.48369693756103516, + "sampling/sampling_logp_difference/mean": 0.014084463939070702, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 261.859375, + "completions/mean_terminated_length": 261.859375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.32786688208580017, + "epoch": 2.3933823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01641777539709697, + "kl": 0.021822992712259293, + "learning_rate": 1.2113033147615071e-07, + "loss": 0.0002, + "num_tokens": 61715029.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.475993275642395, + "sampling/importance_sampling_ratio/mean": 1.0002014636993408, + "sampling/importance_sampling_ratio/min": 0.6106759309768677, + "sampling/sampling_logp_difference/max": 0.49318885803222656, + "sampling/sampling_logp_difference/mean": 0.01305415015667677, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 184.328125, + "completions/mean_terminated_length": 184.328125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.32844293117523193, + "epoch": 2.394607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014920561640594215, + "kl": 0.020569054409861565, + "learning_rate": 1.206658478336071e-07, + "loss": 0.0002, + "num_tokens": 61743674.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007356643676758, + "sampling/importance_sampling_ratio/mean": 1.0000507831573486, + "sampling/importance_sampling_ratio/min": 0.685139000415802, + "sampling/sampling_logp_difference/max": 0.47046327590942383, + "sampling/sampling_logp_difference/mean": 0.01349298283457756, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 164.203125, + "completions/mean_terminated_length": 164.203125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.326759397983551, + "epoch": 2.3958333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02062552066755512, + "kl": 0.02400572970509529, + "learning_rate": 1.2020213421536103e-07, + "loss": 0.0002, + "num_tokens": 61771143.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.405951738357544, + "sampling/importance_sampling_ratio/mean": 0.9998986721038818, + "sampling/importance_sampling_ratio/min": 0.6610286831855774, + "sampling/sampling_logp_difference/max": 0.4139580726623535, + "sampling/sampling_logp_difference/mean": 0.013679726049304008, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 197.171875, + "completions/mean_terminated_length": 197.171875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.48854365944862366, + "epoch": 2.3970588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03660998181636321, + "kl": 0.04842720180749893, + "learning_rate": 1.1973919156272138e-07, + "loss": 0.0005, + "num_tokens": 61807042.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4974969625473022, + "sampling/importance_sampling_ratio/mean": 1.0001397132873535, + "sampling/importance_sampling_ratio/min": 0.6678784489631653, + "sampling/sampling_logp_difference/max": 0.4037950038909912, + "sampling/sampling_logp_difference/mean": 0.01671508140861988, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 234.046875, + "completions/mean_terminated_length": 234.046875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.37770476937294006, + "epoch": 2.3982843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7912677326600676, + "kl": 0.025726042687892914, + "learning_rate": 1.1927702081543278e-07, + "loss": 0.0028, + "num_tokens": 61840373.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4759455919265747, + "sampling/importance_sampling_ratio/mean": 0.9998796582221985, + "sampling/importance_sampling_ratio/min": 0.6622360944747925, + "sampling/sampling_logp_difference/max": 0.4121330976486206, + "sampling/sampling_logp_difference/mean": 0.01440503355115652, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 210.703125, + "completions/mean_terminated_length": 210.703125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.44456803798675537, + "epoch": 2.3995098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012359104355091003, + "kl": 0.02415592223405838, + "learning_rate": 1.188156229116724e-07, + "loss": 0.0002, + "num_tokens": 61881938.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.430909514427185, + "sampling/importance_sampling_ratio/mean": 0.9997051954269409, + "sampling/importance_sampling_ratio/min": 0.6149290204048157, + "sampling/sampling_logp_difference/max": 0.4862484931945801, + "sampling/sampling_logp_difference/mean": 0.016197897493839264, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 197.890625, + "completions/mean_terminated_length": 197.890625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3556378185749054, + "epoch": 2.400735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7852415609725907, + "kl": 0.035944290459156036, + "learning_rate": 1.1835499878804861e-07, + "loss": -0.0092, + "num_tokens": 61912859.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.423384666442871, + "sampling/importance_sampling_ratio/mean": 0.99934983253479, + "sampling/importance_sampling_ratio/min": 0.6411640644073486, + "sampling/sampling_logp_difference/max": 0.4444699287414551, + "sampling/sampling_logp_difference/mean": 0.013170383870601654, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 164.453125, + "completions/mean_terminated_length": 164.453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4567108750343323, + "epoch": 2.4019607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027640789248631354, + "kl": 0.045468587428331375, + "learning_rate": 1.1789514937959965e-07, + "loss": 0.0005, + "num_tokens": 61938680.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.500498652458191, + "sampling/importance_sampling_ratio/mean": 1.0003533363342285, + "sampling/importance_sampling_ratio/min": 0.6302643418312073, + "sampling/sampling_logp_difference/max": 0.46161603927612305, + "sampling/sampling_logp_difference/mean": 0.01733510196208954, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 187.921875, + "completions/mean_terminated_length": 187.921875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.4292413890361786, + "epoch": 2.403186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026728092279935153, + "kl": 0.039078257977962494, + "learning_rate": 1.1743607561979013e-07, + "loss": 0.0004, + "num_tokens": 61970131.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4552148580551147, + "sampling/importance_sampling_ratio/mean": 1.0002341270446777, + "sampling/importance_sampling_ratio/min": 0.7300223112106323, + "sampling/sampling_logp_difference/max": 0.3751535415649414, + "sampling/sampling_logp_difference/mean": 0.01613743230700493, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 151.34375, + "completions/mean_terminated_length": 151.34375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.37262940406799316, + "epoch": 2.4044117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021750435024995473, + "kl": 0.03221053257584572, + "learning_rate": 1.1697777844051104e-07, + "loss": 0.0003, + "num_tokens": 61996681.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.349316954612732, + "sampling/importance_sampling_ratio/mean": 0.9994165897369385, + "sampling/importance_sampling_ratio/min": 0.61583012342453, + "sampling/sampling_logp_difference/max": 0.4847841262817383, + "sampling/sampling_logp_difference/mean": 0.015558486804366112, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 217.0, + "completions/mean_terminated_length": 217.0, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.41379979252815247, + "epoch": 2.405637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.905779788821325, + "kl": 0.03606260195374489, + "learning_rate": 1.1652025877207644e-07, + "loss": 0.0066, + "num_tokens": 62027289.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.319873571395874, + "sampling/importance_sampling_ratio/mean": 1.000108003616333, + "sampling/importance_sampling_ratio/min": 0.6361140608787537, + "sampling/sampling_logp_difference/max": 0.4523773193359375, + "sampling/sampling_logp_difference/mean": 0.014538638293743134, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 157.078125, + "completions/mean_terminated_length": 157.078125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3212488293647766, + "epoch": 2.406862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017227515515825284, + "kl": 0.02447613701224327, + "learning_rate": 1.1606351754322247e-07, + "loss": 0.0002, + "num_tokens": 62052430.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5025979280471802, + "sampling/importance_sampling_ratio/mean": 1.0008751153945923, + "sampling/importance_sampling_ratio/min": 0.7386578917503357, + "sampling/sampling_logp_difference/max": 0.4071955680847168, + "sampling/sampling_logp_difference/mean": 0.013703729957342148, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 200.953125, + "completions/mean_terminated_length": 200.953125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4697273373603821, + "epoch": 2.4080882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7855249261291434, + "kl": 0.05025696009397507, + "learning_rate": 1.156075556811048e-07, + "loss": -0.0114, + "num_tokens": 62084747.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4986478090286255, + "sampling/importance_sampling_ratio/mean": 1.0002055168151855, + "sampling/importance_sampling_ratio/min": 0.6056578159332275, + "sampling/sampling_logp_difference/max": 0.5014400482177734, + "sampling/sampling_logp_difference/mean": 0.017576631158590317, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 170.234375, + "completions/mean_terminated_length": 170.234375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.41129642724990845, + "epoch": 2.409313725490196, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4039440656830176, + "kl": 0.034906573593616486, + "learning_rate": 1.1515237411129697e-07, + "loss": 0.0076, + "num_tokens": 62115898.0, + "reward": -0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.4529551267623901, + "sampling/importance_sampling_ratio/mean": 0.9999223947525024, + "sampling/importance_sampling_ratio/min": 0.6937439441680908, + "sampling/sampling_logp_difference/max": 0.3735995292663574, + "sampling/sampling_logp_difference/mean": 0.015991121530532837, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 170.40625, + "completions/mean_terminated_length": 170.40625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3708356022834778, + "epoch": 2.4105392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8072169240429856, + "kl": 0.05858634412288666, + "learning_rate": 1.1469797375778901e-07, + "loss": -0.016, + "num_tokens": 62139604.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999583959579468, + "sampling/importance_sampling_ratio/min": 0.6091251373291016, + "sampling/sampling_logp_difference/max": 1.2542685270309448, + "sampling/sampling_logp_difference/mean": 0.016308307647705078, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 221.3125, + "completions/mean_terminated_length": 221.3125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.41176238656044006, + "epoch": 2.411764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6709462036295988, + "kl": 0.042348649352788925, + "learning_rate": 1.1424435554298473e-07, + "loss": 0.0266, + "num_tokens": 62175224.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.2956719398498535, + "sampling/importance_sampling_ratio/mean": 0.9998412132263184, + "sampling/importance_sampling_ratio/min": 0.6296241879463196, + "sampling/sampling_logp_difference/max": 0.4626321792602539, + "sampling/sampling_logp_difference/mean": 0.014581560157239437, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 237.1875, + "completions/mean_terminated_length": 237.1875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.42337876558303833, + "epoch": 2.4129901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7657342226561513, + "kl": 0.0363505557179451, + "learning_rate": 1.1379152038770029e-07, + "loss": 0.0135, + "num_tokens": 62211652.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.364220142364502, + "sampling/importance_sampling_ratio/mean": 0.9999942183494568, + "sampling/importance_sampling_ratio/min": 0.657956600189209, + "sampling/sampling_logp_difference/max": 0.41861629486083984, + "sampling/sampling_logp_difference/mean": 0.014614572748541832, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 221.296875, + "completions/mean_terminated_length": 221.296875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.42439010739326477, + "epoch": 2.4142156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7412188463840695, + "kl": 0.07795524597167969, + "learning_rate": 1.1333946921116234e-07, + "loss": 0.0112, + "num_tokens": 62241911.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 0.9999186992645264, + "sampling/importance_sampling_ratio/min": 0.6243884563446045, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.015674494206905365, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 191.453125, + "completions/mean_terminated_length": 191.453125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.45214176177978516, + "epoch": 2.4154411764705883, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25250645981119, + "kl": 0.0508386492729187, + "learning_rate": 1.1288820293100637e-07, + "loss": -0.0355, + "num_tokens": 62273572.0, + "reward": 0.375, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.3050446510314941, + "sampling/importance_sampling_ratio/mean": 1.0000816583633423, + "sampling/importance_sampling_ratio/min": 0.6778684258460999, + "sampling/sampling_logp_difference/max": 0.38880205154418945, + "sampling/sampling_logp_difference/mean": 0.01609937846660614, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 178.53125, + "completions/mean_terminated_length": 178.53125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3968961536884308, + "epoch": 2.4166666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1882204520855186, + "kl": 0.026850782334804535, + "learning_rate": 1.1243772246327415e-07, + "loss": 0.0442, + "num_tokens": 62304358.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.364865779876709, + "sampling/importance_sampling_ratio/mean": 1.000064730644226, + "sampling/importance_sampling_ratio/min": 0.6227121353149414, + "sampling/sampling_logp_difference/max": 0.47367095947265625, + "sampling/sampling_logp_difference/mean": 0.015055298805236816, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 227.171875, + "completions/mean_terminated_length": 227.171875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.45749881863594055, + "epoch": 2.417892156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.117696947672644, + "kl": 0.0721966102719307, + "learning_rate": 1.1198802872241242e-07, + "loss": 0.0116, + "num_tokens": 62338257.0, + "reward": 0.75, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6658930778503418, + "sampling/importance_sampling_ratio/mean": 1.0001027584075928, + "sampling/importance_sampling_ratio/min": 0.4650120437145233, + "sampling/sampling_logp_difference/max": 0.7656919956207275, + "sampling/sampling_logp_difference/mean": 0.016596361994743347, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 127.234375, + "completions/mean_terminated_length": 127.234375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.35489314794540405, + "epoch": 2.4191176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037266972411649196, + "kl": 0.032278575003147125, + "learning_rate": 1.1153912262127119e-07, + "loss": 0.0003, + "num_tokens": 62366480.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3228861093521118, + "sampling/importance_sampling_ratio/mean": 0.9997625350952148, + "sampling/importance_sampling_ratio/min": 0.6396545767784119, + "sampling/sampling_logp_difference/max": 0.4468269348144531, + "sampling/sampling_logp_difference/mean": 0.014400172978639603, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 179.515625, + "completions/mean_terminated_length": 179.515625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3122207820415497, + "epoch": 2.420343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9409392365027711, + "kl": 0.02874099276959896, + "learning_rate": 1.1109100507110131e-07, + "loss": -0.0084, + "num_tokens": 62391761.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6055723428726196, + "sampling/importance_sampling_ratio/mean": 1.0000977516174316, + "sampling/importance_sampling_ratio/min": 0.7315836548805237, + "sampling/sampling_logp_difference/max": 0.473480224609375, + "sampling/sampling_logp_difference/mean": 0.01330866850912571, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 199.4375, + "completions/mean_terminated_length": 199.4375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.4175884425640106, + "epoch": 2.4215686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.782914234652367, + "kl": 0.05412402004003525, + "learning_rate": 1.1064367698155303e-07, + "loss": -0.0139, + "num_tokens": 62426445.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5500508546829224, + "sampling/importance_sampling_ratio/mean": 1.0003635883331299, + "sampling/importance_sampling_ratio/min": 0.6406451463699341, + "sampling/sampling_logp_difference/max": 0.445279598236084, + "sampling/sampling_logp_difference/mean": 0.015453522093594074, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 180.59375, + "completions/mean_terminated_length": 180.59375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3654729425907135, + "epoch": 2.422794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3398562328508408, + "kl": 0.04450765997171402, + "learning_rate": 1.1019713926067392e-07, + "loss": -0.0163, + "num_tokens": 62455875.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5971509218215942, + "sampling/importance_sampling_ratio/mean": 1.0005977153778076, + "sampling/importance_sampling_ratio/min": 0.617858350276947, + "sampling/sampling_logp_difference/max": 0.48149609565734863, + "sampling/sampling_logp_difference/mean": 0.01475649606436491, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 242.671875, + "completions/mean_terminated_length": 242.671875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.4201916456222534, + "epoch": 2.424019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7141283259830095, + "kl": 0.03380756080150604, + "learning_rate": 1.0975139281490747e-07, + "loss": -0.0309, + "num_tokens": 62490238.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7181060314178467, + "sampling/importance_sampling_ratio/mean": 1.000054121017456, + "sampling/importance_sampling_ratio/min": 0.631348192691803, + "sampling/sampling_logp_difference/max": 0.5412225723266602, + "sampling/sampling_logp_difference/mean": 0.015032166615128517, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 144.40625, + "completions/mean_terminated_length": 144.40625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3454689383506775, + "epoch": 2.4252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02095865564486938, + "kl": 0.0612105168402195, + "learning_rate": 1.093064385490906e-07, + "loss": 0.0005, + "num_tokens": 62513832.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6124627590179443, + "sampling/importance_sampling_ratio/mean": 1.0005912780761719, + "sampling/importance_sampling_ratio/min": 0.6428762674331665, + "sampling/sampling_logp_difference/max": 0.47776269912719727, + "sampling/sampling_logp_difference/mean": 0.014039501547813416, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 250.140625, + "completions/mean_terminated_length": 250.140625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.4830671548843384, + "epoch": 2.426470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8037283796562461, + "kl": 0.07431580126285553, + "learning_rate": 1.0886227736645215e-07, + "loss": -0.0246, + "num_tokens": 62552561.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001500844955444, + "sampling/importance_sampling_ratio/min": 0.36815980076789856, + "sampling/sampling_logp_difference/max": 0.9992382526397705, + "sampling/sampling_logp_difference/mean": 0.016817739233374596, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 184.71875, + "completions/mean_terminated_length": 184.71875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.44568321108818054, + "epoch": 2.4276960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4118583440996253, + "kl": 0.06490681320428848, + "learning_rate": 1.0841891016861155e-07, + "loss": -0.0211, + "num_tokens": 62584975.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5715563297271729, + "sampling/importance_sampling_ratio/mean": 0.9995582699775696, + "sampling/importance_sampling_ratio/min": 0.487101674079895, + "sampling/sampling_logp_difference/max": 0.7192823886871338, + "sampling/sampling_logp_difference/mean": 0.01702762395143509, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 218.234375, + "completions/mean_terminated_length": 218.234375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.38632649183273315, + "epoch": 2.428921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017640183863070005, + "kl": 0.0258384607732296, + "learning_rate": 1.0797633785557581e-07, + "loss": 0.0003, + "num_tokens": 62620494.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4703099727630615, + "sampling/importance_sampling_ratio/mean": 1.0003032684326172, + "sampling/importance_sampling_ratio/min": 0.6110231280326843, + "sampling/sampling_logp_difference/max": 0.49262046813964844, + "sampling/sampling_logp_difference/mean": 0.014505396597087383, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 177.359375, + "completions/mean_terminated_length": 177.359375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3513790965080261, + "epoch": 2.4301470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018986955667485664, + "kl": 0.02966098114848137, + "learning_rate": 1.0753456132573885e-07, + "loss": 0.0003, + "num_tokens": 62652261.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4397295713424683, + "sampling/importance_sampling_ratio/mean": 0.99994295835495, + "sampling/importance_sampling_ratio/min": 0.6086769700050354, + "sampling/sampling_logp_difference/max": 0.49646759033203125, + "sampling/sampling_logp_difference/mean": 0.015134020708501339, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 198.703125, + "completions/mean_terminated_length": 198.703125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3957688808441162, + "epoch": 2.431372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028757894653507546, + "kl": 0.041085124015808105, + "learning_rate": 1.0709358147587883e-07, + "loss": 0.0005, + "num_tokens": 62684642.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4350110292434692, + "sampling/importance_sampling_ratio/mean": 1.0003424882888794, + "sampling/importance_sampling_ratio/min": 0.6262711882591248, + "sampling/sampling_logp_difference/max": 0.4679718017578125, + "sampling/sampling_logp_difference/mean": 0.015146794728934765, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 240.71875, + "completions/mean_terminated_length": 240.71875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.41249239444732666, + "epoch": 2.4325980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7045451304959446, + "kl": 0.027376476675271988, + "learning_rate": 1.0665339920115718e-07, + "loss": 0.0055, + "num_tokens": 62717248.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4753891229629517, + "sampling/importance_sampling_ratio/mean": 0.9999990463256836, + "sampling/importance_sampling_ratio/min": 0.6014488935470581, + "sampling/sampling_logp_difference/max": 0.5084137916564941, + "sampling/sampling_logp_difference/mean": 0.015139452181756496, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 233.921875, + "completions/mean_terminated_length": 233.921875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.5119118094444275, + "epoch": 2.4338235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0695061553494256, + "kl": 0.044679585844278336, + "learning_rate": 1.0621401539511587e-07, + "loss": 0.0414, + "num_tokens": 62755067.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4380325078964233, + "sampling/importance_sampling_ratio/mean": 0.9998857975006104, + "sampling/importance_sampling_ratio/min": 0.7264178395271301, + "sampling/sampling_logp_difference/max": 0.36327576637268066, + "sampling/sampling_logp_difference/mean": 0.016982797533273697, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 159.125, + "completions/mean_terminated_length": 159.125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.35807543992996216, + "epoch": 2.435049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01767856583666848, + "kl": 0.021159913390874863, + "learning_rate": 1.0577543094967611e-07, + "loss": 0.0002, + "num_tokens": 62782771.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.425788402557373, + "sampling/importance_sampling_ratio/mean": 0.9999739527702332, + "sampling/importance_sampling_ratio/min": 0.6175535917282104, + "sampling/sampling_logp_difference/max": 0.48198938369750977, + "sampling/sampling_logp_difference/mean": 0.014918528497219086, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 194.5, + "completions/mean_terminated_length": 194.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.36983782052993774, + "epoch": 2.436274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017714931439837617, + "kl": 0.027224186807870865, + "learning_rate": 1.053376467551368e-07, + "loss": 0.0003, + "num_tokens": 62812323.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3434499502182007, + "sampling/importance_sampling_ratio/mean": 0.9995107054710388, + "sampling/importance_sampling_ratio/min": 0.6668502688407898, + "sampling/sampling_logp_difference/max": 0.40518975257873535, + "sampling/sampling_logp_difference/mean": 0.014425843022763729, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 177.046875, + "completions/mean_terminated_length": 177.046875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3473891019821167, + "epoch": 2.4375, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8341674998529726, + "kl": 0.037632253021001816, + "learning_rate": 1.0490066370017181e-07, + "loss": 0.0272, + "num_tokens": 62840534.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6463156938552856, + "sampling/importance_sampling_ratio/mean": 0.9990817904472351, + "sampling/importance_sampling_ratio/min": 0.6613803505897522, + "sampling/sampling_logp_difference/max": 0.49853992462158203, + "sampling/sampling_logp_difference/mean": 0.013913518749177456, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 203.328125, + "completions/mean_terminated_length": 203.328125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2740914821624756, + "epoch": 2.438725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01838227270240207, + "kl": 0.023576756939291954, + "learning_rate": 1.044644826718295e-07, + "loss": 0.0002, + "num_tokens": 62876235.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7334914207458496, + "sampling/importance_sampling_ratio/mean": 1.000614881515503, + "sampling/importance_sampling_ratio/min": 0.5954729914665222, + "sampling/sampling_logp_difference/max": 0.5501375198364258, + "sampling/sampling_logp_difference/mean": 0.01161906123161316, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 193.84375, + "completions/mean_terminated_length": 193.84375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.36370471119880676, + "epoch": 2.439950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9301842093189077, + "kl": 0.026500212028622627, + "learning_rate": 1.0402910455552916e-07, + "loss": 0.003, + "num_tokens": 62907009.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.441874384880066, + "sampling/importance_sampling_ratio/mean": 0.999746561050415, + "sampling/importance_sampling_ratio/min": 0.6290517449378967, + "sampling/sampling_logp_difference/max": 0.46354174613952637, + "sampling/sampling_logp_difference/mean": 0.015026746317744255, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 188.90625, + "completions/mean_terminated_length": 188.90625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4158353805541992, + "epoch": 2.4411764705882355, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1963761082120996, + "kl": 0.05945902317762375, + "learning_rate": 1.0359453023506121e-07, + "loss": 0.0167, + "num_tokens": 62934843.0, + "reward": 0.0, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3790603876113892, + "sampling/importance_sampling_ratio/mean": 0.9996902942657471, + "sampling/importance_sampling_ratio/min": 0.7386338114738464, + "sampling/sampling_logp_difference/max": 0.32140231132507324, + "sampling/sampling_logp_difference/mean": 0.015041791833937168, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 178.296875, + "completions/mean_terminated_length": 178.296875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.37770169973373413, + "epoch": 2.4424019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7538945745100293, + "kl": 0.026087483391165733, + "learning_rate": 1.0316076059258389e-07, + "loss": -0.0164, + "num_tokens": 62963758.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4513964653015137, + "sampling/importance_sampling_ratio/mean": 1.0002007484436035, + "sampling/importance_sampling_ratio/min": 0.6932904124259949, + "sampling/sampling_logp_difference/max": 0.3725261688232422, + "sampling/sampling_logp_difference/mean": 0.014406517148017883, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 161.203125, + "completions/mean_terminated_length": 161.203125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.34088271856307983, + "epoch": 2.443627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7765054386984444, + "kl": 0.041137732565402985, + "learning_rate": 1.0272779650862185e-07, + "loss": -0.0014, + "num_tokens": 62993723.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.2942837476730347, + "sampling/importance_sampling_ratio/mean": 0.9998880624771118, + "sampling/importance_sampling_ratio/min": 0.6849300861358643, + "sampling/sampling_logp_difference/max": 0.37843847274780273, + "sampling/sampling_logp_difference/mean": 0.014076050370931625, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 184.90625, + "completions/mean_terminated_length": 184.90625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.514701247215271, + "epoch": 2.4448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9632573636638977, + "kl": 0.039441876113414764, + "learning_rate": 1.0229563886206516e-07, + "loss": 0.0064, + "num_tokens": 63024901.0, + "reward": -0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": -0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.3393256664276123, + "sampling/importance_sampling_ratio/mean": 0.999917209148407, + "sampling/importance_sampling_ratio/min": 0.6539126038551331, + "sampling/sampling_logp_difference/max": 0.42478156089782715, + "sampling/sampling_logp_difference/mean": 0.018140411004424095, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 185.96875, + "completions/mean_terminated_length": 185.96875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3519733250141144, + "epoch": 2.446078431372549, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.963101684896797, + "kl": 0.02421330101788044, + "learning_rate": 1.0186428853016604e-07, + "loss": 0.0044, + "num_tokens": 63058179.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3225116729736328, + "sampling/importance_sampling_ratio/mean": 1.0001866817474365, + "sampling/importance_sampling_ratio/min": 0.6189565062522888, + "sampling/sampling_logp_difference/max": 0.4797203540802002, + "sampling/sampling_logp_difference/mean": 0.013560053892433643, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 173.796875, + "completions/mean_terminated_length": 173.796875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.38833457231521606, + "epoch": 2.4473039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.8695705363505262, + "kl": 0.04325224086642265, + "learning_rate": 1.0143374638853891e-07, + "loss": -0.0217, + "num_tokens": 63085302.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000678300857544, + "sampling/importance_sampling_ratio/min": 0.47059664130210876, + "sampling/sampling_logp_difference/max": 1.3454623222351074, + "sampling/sampling_logp_difference/mean": 0.015392575412988663, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 183.9375, + "completions/mean_terminated_length": 183.9375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.412697970867157, + "epoch": 2.448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7677827256530417, + "kl": 0.057433705776929855, + "learning_rate": 1.0100401331115638e-07, + "loss": 0.012, + "num_tokens": 63115026.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 1.00032377243042, + "sampling/importance_sampling_ratio/min": 0.6919641494750977, + "sampling/sampling_logp_difference/max": 0.42380309104919434, + "sampling/sampling_logp_difference/mean": 0.015239194966852665, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 197.71875, + "completions/mean_terminated_length": 197.71875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.45423510670661926, + "epoch": 2.4497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030658816526145053, + "kl": 0.04963368922472, + "learning_rate": 1.0057509017034977e-07, + "loss": 0.0005, + "num_tokens": 63145728.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.475376844406128, + "sampling/importance_sampling_ratio/mean": 0.9997437000274658, + "sampling/importance_sampling_ratio/min": 0.6212072968482971, + "sampling/sampling_logp_difference/max": 0.4760904312133789, + "sampling/sampling_logp_difference/mean": 0.01706961914896965, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 160.359375, + "completions/mean_terminated_length": 160.359375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.34984079003334045, + "epoch": 2.450980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03902570883154942, + "kl": 0.05480959266424179, + "learning_rate": 1.001469778368057e-07, + "loss": 0.0006, + "num_tokens": 63171479.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002403259277344, + "sampling/importance_sampling_ratio/min": 0.6890142560005188, + "sampling/sampling_logp_difference/max": 1.0377509593963623, + "sampling/sampling_logp_difference/mean": 0.014667082577943802, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 207.65625, + "completions/mean_terminated_length": 207.65625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4067958891391754, + "epoch": 2.452205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.150074053740817, + "kl": 0.030066289007663727, + "learning_rate": 9.971967717956531e-08, + "loss": -0.0486, + "num_tokens": 63212337.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4151383638381958, + "sampling/importance_sampling_ratio/mean": 1.0000475645065308, + "sampling/importance_sampling_ratio/min": 0.6594096422195435, + "sampling/sampling_logp_difference/max": 0.41641032695770264, + "sampling/sampling_logp_difference/mean": 0.015238635241985321, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 207.609375, + "completions/mean_terminated_length": 207.609375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.3727913200855255, + "epoch": 2.4534313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016721451945647513, + "kl": 0.028154637664556503, + "learning_rate": 9.929318906602174e-08, + "loss": 0.0003, + "num_tokens": 63241848.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3162541389465332, + "sampling/importance_sampling_ratio/mean": 1.0001139640808105, + "sampling/importance_sampling_ratio/min": 0.6552072167396545, + "sampling/sampling_logp_difference/max": 0.42280375957489014, + "sampling/sampling_logp_difference/mean": 0.013950793072581291, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 226.6875, + "completions/mean_terminated_length": 226.6875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3771401047706604, + "epoch": 2.454656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014699440635072126, + "kl": 0.02426079846918583, + "learning_rate": 9.886751436191871e-08, + "loss": 0.0002, + "num_tokens": 63276148.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4930033683776855, + "sampling/importance_sampling_ratio/mean": 1.000488519668579, + "sampling/importance_sampling_ratio/min": 0.6954967975616455, + "sampling/sampling_logp_difference/max": 0.400789737701416, + "sampling/sampling_logp_difference/mean": 0.01370636560022831, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 179.84375, + "completions/mean_terminated_length": 179.84375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3870733380317688, + "epoch": 2.4558823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9187841935687155, + "kl": 0.05936279892921448, + "learning_rate": 9.844265393134926e-08, + "loss": 0.0108, + "num_tokens": 63307226.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.797240972518921, + "sampling/importance_sampling_ratio/mean": 1.0002281665802002, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.5862526893615723, + "sampling/sampling_logp_difference/mean": 0.01534139271825552, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 175.421875, + "completions/mean_terminated_length": 175.421875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.400735080242157, + "epoch": 2.457107843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8288882489790806, + "kl": 0.033195994794368744, + "learning_rate": 9.801860863675266e-08, + "loss": -0.0031, + "num_tokens": 63338357.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5534627437591553, + "sampling/importance_sampling_ratio/mean": 0.9995038509368896, + "sampling/importance_sampling_ratio/min": 0.6670553684234619, + "sampling/sampling_logp_difference/max": 0.44048643112182617, + "sampling/sampling_logp_difference/mean": 0.01500217616558075, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 206.5, + "completions/mean_terminated_length": 206.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.43173086643218994, + "epoch": 2.4583333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.14602123466493, + "kl": 0.05417206510901451, + "learning_rate": 9.759537933891421e-08, + "loss": -0.0255, + "num_tokens": 63367765.0, + "reward": 0.59375, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4411839246749878, + "sampling/importance_sampling_ratio/mean": 0.9996616244316101, + "sampling/importance_sampling_ratio/min": 0.6482206583023071, + "sampling/sampling_logp_difference/max": 0.43352413177490234, + "sampling/sampling_logp_difference/mean": 0.016676776111125946, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 180.078125, + "completions/mean_terminated_length": 180.078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.45597320795059204, + "epoch": 2.4595588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.048256222930180066, + "kl": 0.11169981211423874, + "learning_rate": 9.71729668969628e-08, + "loss": 0.0011, + "num_tokens": 63397258.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998218417167664, + "sampling/importance_sampling_ratio/min": 0.6490694880485535, + "sampling/sampling_logp_difference/max": 0.7834055423736572, + "sampling/sampling_logp_difference/mean": 0.017066188156604767, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 175.3125, + "completions/mean_terminated_length": 175.3125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4196290969848633, + "epoch": 2.4607843137254903, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2622915347290284, + "kl": 0.05284261703491211, + "learning_rate": 9.67513721683687e-08, + "loss": 0.0063, + "num_tokens": 63423918.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5551180839538574, + "sampling/importance_sampling_ratio/mean": 0.9994953274726868, + "sampling/importance_sampling_ratio/min": 0.6151927709579468, + "sampling/sampling_logp_difference/max": 0.48581957817077637, + "sampling/sampling_logp_difference/mean": 0.015804724767804146, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 184.03125, + "completions/mean_terminated_length": 184.03125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3441496193408966, + "epoch": 2.4620098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018857326764506976, + "kl": 0.02470124140381813, + "learning_rate": 9.633059600894256e-08, + "loss": 0.0002, + "num_tokens": 63460752.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007341146469116, + "sampling/importance_sampling_ratio/mean": 0.999882698059082, + "sampling/importance_sampling_ratio/min": 0.6460320949554443, + "sampling/sampling_logp_difference/max": 0.4704623222351074, + "sampling/sampling_logp_difference/mean": 0.013898389413952827, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 175.40625, + "completions/mean_terminated_length": 175.40625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.37059569358825684, + "epoch": 2.463235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020059675314963523, + "kl": 0.02793136239051819, + "learning_rate": 9.59106392728331e-08, + "loss": 0.0003, + "num_tokens": 63491914.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.878018856048584, + "sampling/importance_sampling_ratio/mean": 0.9999091625213623, + "sampling/importance_sampling_ratio/min": 0.730019211769104, + "sampling/sampling_logp_difference/max": 0.630217432975769, + "sampling/sampling_logp_difference/mean": 0.014268193393945694, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 191.046875, + "completions/mean_terminated_length": 191.046875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3454241156578064, + "epoch": 2.4644607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01668403844722609, + "kl": 0.02659863792359829, + "learning_rate": 9.549150281252632e-08, + "loss": 0.0003, + "num_tokens": 63526301.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4916746616363525, + "sampling/importance_sampling_ratio/mean": 0.9998441934585571, + "sampling/importance_sampling_ratio/min": 0.4908023476600647, + "sampling/sampling_logp_difference/max": 0.7117137908935547, + "sampling/sampling_logp_difference/mean": 0.014071143232285976, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 224.015625, + "completions/mean_terminated_length": 224.015625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.45142269134521484, + "epoch": 2.465686274509804, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9943780380033651, + "kl": 0.053271468728780746, + "learning_rate": 9.507318747884241e-08, + "loss": -0.0399, + "num_tokens": 63560734.0, + "reward": 0.125, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.9499462842941284, + "sampling/importance_sampling_ratio/mean": 0.9997901320457458, + "sampling/importance_sampling_ratio/min": 0.6057900190353394, + "sampling/sampling_logp_difference/max": 0.6678018569946289, + "sampling/sampling_logp_difference/mean": 0.015925079584121704, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 204.546875, + "completions/mean_terminated_length": 204.546875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.38158512115478516, + "epoch": 2.4669117647058822, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.199859856240969, + "kl": 0.049633897840976715, + "learning_rate": 9.465569412093488e-08, + "loss": 0.0155, + "num_tokens": 63589393.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4727635383605957, + "sampling/importance_sampling_ratio/mean": 0.9995374083518982, + "sampling/importance_sampling_ratio/min": 0.624771773815155, + "sampling/sampling_logp_difference/max": 0.4703688621520996, + "sampling/sampling_logp_difference/mean": 0.014872990548610687, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 233.03125, + "completions/mean_terminated_length": 233.03125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3444235324859619, + "epoch": 2.468137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6450744790665558, + "kl": 0.026743315160274506, + "learning_rate": 9.423902358628916e-08, + "loss": 0.0071, + "num_tokens": 63629219.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5467276573181152, + "sampling/importance_sampling_ratio/mean": 1.0003172159194946, + "sampling/importance_sampling_ratio/min": 0.6140531897544861, + "sampling/sampling_logp_difference/max": 0.4876737594604492, + "sampling/sampling_logp_difference/mean": 0.013777516782283783, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 201.46875, + "completions/mean_terminated_length": 201.46875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3421174883842468, + "epoch": 2.469362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8465805260813877, + "kl": 0.03840591758489609, + "learning_rate": 9.382317672071966e-08, + "loss": 0.0088, + "num_tokens": 63655601.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5356110334396362, + "sampling/importance_sampling_ratio/mean": 0.9997233152389526, + "sampling/importance_sampling_ratio/min": 0.6057863831520081, + "sampling/sampling_logp_difference/max": 0.501227855682373, + "sampling/sampling_logp_difference/mean": 0.01357186958193779, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 225.375, + "completions/mean_terminated_length": 225.375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.42109107971191406, + "epoch": 2.4705882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0802098526469588, + "kl": 0.035182803869247437, + "learning_rate": 9.340815436836963e-08, + "loss": -0.0328, + "num_tokens": 63688569.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6001441478729248, + "sampling/importance_sampling_ratio/mean": 0.9997341632843018, + "sampling/importance_sampling_ratio/min": 0.4194274842739105, + "sampling/sampling_logp_difference/max": 0.8688646554946899, + "sampling/sampling_logp_difference/mean": 0.016411174088716507, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 258.828125, + "completions/mean_terminated_length": 258.828125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.3673095703125, + "epoch": 2.471813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7011740149640728, + "kl": 0.024489955976605415, + "learning_rate": 9.299395737170757e-08, + "loss": -0.0232, + "num_tokens": 63722350.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4139859676361084, + "sampling/importance_sampling_ratio/mean": 0.9997345805168152, + "sampling/importance_sampling_ratio/min": 0.6033496260643005, + "sampling/sampling_logp_difference/max": 0.5052584409713745, + "sampling/sampling_logp_difference/mean": 0.013786962255835533, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 159.4375, + "completions/mean_terminated_length": 159.4375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.34036415815353394, + "epoch": 2.4730392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023419569378104955, + "kl": 0.025814056396484375, + "learning_rate": 9.258058657152761e-08, + "loss": 0.0003, + "num_tokens": 63749866.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5875335931777954, + "sampling/importance_sampling_ratio/mean": 1.0000228881835938, + "sampling/importance_sampling_ratio/min": 0.641847550868988, + "sampling/sampling_logp_difference/max": 0.46218156814575195, + "sampling/sampling_logp_difference/mean": 0.014968582428991795, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 207.390625, + "completions/mean_terminated_length": 207.390625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.37217777967453003, + "epoch": 2.474264705882353, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3054132438275574, + "kl": 0.035565003752708435, + "learning_rate": 9.216804280694612e-08, + "loss": -0.0938, + "num_tokens": 63779571.0, + "reward": -0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6907960176467896, + "sampling/importance_sampling_ratio/mean": 1.000901460647583, + "sampling/importance_sampling_ratio/min": 0.6325021386146545, + "sampling/sampling_logp_difference/max": 0.5251994132995605, + "sampling/sampling_logp_difference/mean": 0.015001806430518627, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 199.125, + "completions/mean_terminated_length": 199.125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3852364122867584, + "epoch": 2.4754901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016064194865100398, + "kl": 0.02930980920791626, + "learning_rate": 9.175632691540064e-08, + "loss": 0.0003, + "num_tokens": 63813435.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5754631757736206, + "sampling/importance_sampling_ratio/mean": 1.0003750324249268, + "sampling/importance_sampling_ratio/min": 0.6259395480155945, + "sampling/sampling_logp_difference/max": 0.46850156784057617, + "sampling/sampling_logp_difference/mean": 0.014602059498429298, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 131.03125, + "completions/mean_terminated_length": 131.03125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3446516692638397, + "epoch": 2.4767156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0883625505459922, + "kl": 0.04013896360993385, + "learning_rate": 9.134543973264868e-08, + "loss": 0.0209, + "num_tokens": 63833261.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5031681060791016, + "sampling/importance_sampling_ratio/mean": 1.0001024007797241, + "sampling/importance_sampling_ratio/min": 0.6957728862762451, + "sampling/sampling_logp_difference/max": 0.40757501125335693, + "sampling/sampling_logp_difference/mean": 0.015270713716745377, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 224.078125, + "completions/mean_terminated_length": 224.078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.4325482249259949, + "epoch": 2.4779411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7694334140558714, + "kl": 0.06226831674575806, + "learning_rate": 9.093538209276486e-08, + "loss": -0.0138, + "num_tokens": 63863330.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3806134462356567, + "sampling/importance_sampling_ratio/mean": 1.0003423690795898, + "sampling/importance_sampling_ratio/min": 0.6546946167945862, + "sampling/sampling_logp_difference/max": 0.423586368560791, + "sampling/sampling_logp_difference/mean": 0.015872148796916008, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 237.34375, + "completions/mean_terminated_length": 237.34375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3297128975391388, + "epoch": 2.4791666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6557936677120535, + "kl": 0.0265534445643425, + "learning_rate": 9.052615482814069e-08, + "loss": 0.0173, + "num_tokens": 63901064.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.2983318567276, + "sampling/importance_sampling_ratio/mean": 0.9998232126235962, + "sampling/importance_sampling_ratio/min": 0.6395631432533264, + "sampling/sampling_logp_difference/max": 0.44696998596191406, + "sampling/sampling_logp_difference/mean": 0.011355316266417503, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 200.953125, + "completions/mean_terminated_length": 200.953125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4074874520301819, + "epoch": 2.480392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0673112340216897, + "kl": 0.0918905958533287, + "learning_rate": 9.011775876948096e-08, + "loss": 0.0229, + "num_tokens": 63928997.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4352679252624512, + "sampling/importance_sampling_ratio/mean": 1.0004394054412842, + "sampling/importance_sampling_ratio/min": 0.6043077707290649, + "sampling/sampling_logp_difference/max": 0.5036716461181641, + "sampling/sampling_logp_difference/mean": 0.015174496918916702, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 181.1875, + "completions/mean_terminated_length": 181.1875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.39710932970046997, + "epoch": 2.4816176470588234, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.6652212529846395, + "kl": 0.05999930948019028, + "learning_rate": 8.971019474580427e-08, + "loss": -0.0612, + "num_tokens": 63954081.0, + "reward": 0.1875, + "reward_std": 0.6525881886482239, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.2878774404525757, + "sampling/importance_sampling_ratio/mean": 0.9994023442268372, + "sampling/importance_sampling_ratio/min": 0.6396613121032715, + "sampling/sampling_logp_difference/max": 0.44681644439697266, + "sampling/sampling_logp_difference/mean": 0.016382107511162758, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 179.9375, + "completions/mean_terminated_length": 179.9375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4748602509498596, + "epoch": 2.482843137254902, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4228785543022433, + "kl": 0.0473778061568737, + "learning_rate": 8.930346358443953e-08, + "loss": -0.0277, + "num_tokens": 63980269.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.320014238357544, + "sampling/importance_sampling_ratio/mean": 0.9999608993530273, + "sampling/importance_sampling_ratio/min": 0.6395493745803833, + "sampling/sampling_logp_difference/max": 0.4469914436340332, + "sampling/sampling_logp_difference/mean": 0.01677963137626648, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 195.46875, + "completions/mean_terminated_length": 195.46875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.45336729288101196, + "epoch": 2.4840686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05164388911331413, + "kl": 0.0730249434709549, + "learning_rate": 8.889756611102539e-08, + "loss": 0.0008, + "num_tokens": 64007307.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6210100650787354, + "sampling/importance_sampling_ratio/mean": 1.0003575086593628, + "sampling/importance_sampling_ratio/min": 0.6512575149536133, + "sampling/sampling_logp_difference/max": 0.4830493927001953, + "sampling/sampling_logp_difference/mean": 0.016209498047828674, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 192.90625, + "completions/mean_terminated_length": 192.90625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3370332717895508, + "epoch": 2.485294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014438929385510302, + "kl": 0.027498988434672356, + "learning_rate": 8.84925031495079e-08, + "loss": 0.0003, + "num_tokens": 64036853.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4217779636383057, + "sampling/importance_sampling_ratio/mean": 1.0000685453414917, + "sampling/importance_sampling_ratio/min": 0.5487043857574463, + "sampling/sampling_logp_difference/max": 0.6001954078674316, + "sampling/sampling_logp_difference/mean": 0.013127630576491356, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 171.5625, + "completions/mean_terminated_length": 171.5625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4325448274612427, + "epoch": 2.486519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2946288954815135, + "kl": 0.043557170778512955, + "learning_rate": 8.808827552213916e-08, + "loss": -0.0239, + "num_tokens": 64062217.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.8101012706756592, + "sampling/importance_sampling_ratio/mean": 1.0000345706939697, + "sampling/importance_sampling_ratio/min": 0.6701188087463379, + "sampling/sampling_logp_difference/max": 0.5933828353881836, + "sampling/sampling_logp_difference/mean": 0.017916850745677948, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 153.078125, + "completions/mean_terminated_length": 153.078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.39488697052001953, + "epoch": 2.4877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02236514120023166, + "kl": 0.0324961319565773, + "learning_rate": 8.768488404947593e-08, + "loss": 0.0003, + "num_tokens": 64088398.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4411685466766357, + "sampling/importance_sampling_ratio/mean": 0.9999464750289917, + "sampling/importance_sampling_ratio/min": 0.704010546207428, + "sampling/sampling_logp_difference/max": 0.36545419692993164, + "sampling/sampling_logp_difference/mean": 0.016623370349407196, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 212.53125, + "completions/mean_terminated_length": 212.53125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3667662739753723, + "epoch": 2.488970588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020995587781656774, + "kl": 0.04251813888549805, + "learning_rate": 8.728232955037696e-08, + "loss": 0.0004, + "num_tokens": 64119408.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5746550559997559, + "sampling/importance_sampling_ratio/mean": 0.9998937845230103, + "sampling/importance_sampling_ratio/min": 0.6589528918266296, + "sampling/sampling_logp_difference/max": 0.45403623580932617, + "sampling/sampling_logp_difference/mean": 0.014021791517734528, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 196.828125, + "completions/mean_terminated_length": 196.828125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.33141422271728516, + "epoch": 2.4901960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016957718744736568, + "kl": 0.024404536932706833, + "learning_rate": 8.688061284200265e-08, + "loss": 0.0002, + "num_tokens": 64150885.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3832037448883057, + "sampling/importance_sampling_ratio/mean": 0.9999232888221741, + "sampling/importance_sampling_ratio/min": 0.7788761854171753, + "sampling/sampling_logp_difference/max": 0.3244023323059082, + "sampling/sampling_logp_difference/mean": 0.013108965009450912, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 239.671875, + "completions/mean_terminated_length": 239.671875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.47033601999282837, + "epoch": 2.491421568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7357835453529568, + "kl": 0.07376972585916519, + "learning_rate": 8.647973473981224e-08, + "loss": -0.0187, + "num_tokens": 64185872.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.400166630744934, + "sampling/importance_sampling_ratio/mean": 0.9998288154602051, + "sampling/importance_sampling_ratio/min": 0.6527488827705383, + "sampling/sampling_logp_difference/max": 0.4265627861022949, + "sampling/sampling_logp_difference/mean": 0.015554027631878853, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 181.96875, + "completions/mean_terminated_length": 181.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.40833908319473267, + "epoch": 2.4926470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027525734351921034, + "kl": 0.035365961492061615, + "learning_rate": 8.607969605756315e-08, + "loss": 0.0004, + "num_tokens": 64215886.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4768397808074951, + "sampling/importance_sampling_ratio/mean": 1.0000439882278442, + "sampling/importance_sampling_ratio/min": 0.6260562539100647, + "sampling/sampling_logp_difference/max": 0.46831512451171875, + "sampling/sampling_logp_difference/mean": 0.015282193198800087, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 211.578125, + "completions/mean_terminated_length": 211.578125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4208946228027344, + "epoch": 2.493872549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017929340389628883, + "kl": 0.02399945817887783, + "learning_rate": 8.568049760730838e-08, + "loss": 0.0002, + "num_tokens": 64250579.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.515325903892517, + "sampling/importance_sampling_ratio/mean": 0.9996589422225952, + "sampling/importance_sampling_ratio/min": 0.6436850428581238, + "sampling/sampling_logp_difference/max": 0.44054579734802246, + "sampling/sampling_logp_difference/mean": 0.016491299495100975, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 207.796875, + "completions/mean_terminated_length": 207.796875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.466720312833786, + "epoch": 2.4950980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1012710581858423, + "kl": 0.05975695699453354, + "learning_rate": 8.52821401993955e-08, + "loss": 0.0114, + "num_tokens": 64282758.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5589814186096191, + "sampling/importance_sampling_ratio/mean": 1.0004221200942993, + "sampling/importance_sampling_ratio/min": 0.37357693910598755, + "sampling/sampling_logp_difference/max": 0.9846312999725342, + "sampling/sampling_logp_difference/mean": 0.016525056213140488, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 177.65625, + "completions/mean_terminated_length": 177.65625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.36008650064468384, + "epoch": 2.4963235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01835023940067309, + "kl": 0.02421746775507927, + "learning_rate": 8.488462464246493e-08, + "loss": 0.0002, + "num_tokens": 64314416.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5008811950683594, + "sampling/importance_sampling_ratio/mean": 0.999953031539917, + "sampling/importance_sampling_ratio/min": 0.6886656284332275, + "sampling/sampling_logp_difference/max": 0.4060523509979248, + "sampling/sampling_logp_difference/mean": 0.015186588279902935, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 155.78125, + "completions/mean_terminated_length": 155.78125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3260124921798706, + "epoch": 2.497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031217136219159305, + "kl": 0.03108660690486431, + "learning_rate": 8.448795174344803e-08, + "loss": 0.0003, + "num_tokens": 64341890.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4375299215316772, + "sampling/importance_sampling_ratio/mean": 1.0002895593643188, + "sampling/importance_sampling_ratio/min": 0.6622427105903625, + "sampling/sampling_logp_difference/max": 0.4121232032775879, + "sampling/sampling_logp_difference/mean": 0.01380416564643383, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 172.9375, + "completions/mean_terminated_length": 172.9375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3441891372203827, + "epoch": 2.498774509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020790095945366873, + "kl": 0.03097061812877655, + "learning_rate": 8.409212230756563e-08, + "loss": 0.0003, + "num_tokens": 64367470.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4096341133117676, + "sampling/importance_sampling_ratio/mean": 1.0000736713409424, + "sampling/importance_sampling_ratio/min": 0.5910730957984924, + "sampling/sampling_logp_difference/max": 0.5258156061172485, + "sampling/sampling_logp_difference/mean": 0.014658539555966854, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 219.296875, + "completions/mean_terminated_length": 219.296875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3873305916786194, + "epoch": 2.5, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8664369968312178, + "kl": 0.035865385085344315, + "learning_rate": 8.369713713832622e-08, + "loss": 0.0096, + "num_tokens": 64400961.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4710254669189453, + "sampling/importance_sampling_ratio/mean": 1.0000736713409424, + "sampling/importance_sampling_ratio/min": 0.6631979942321777, + "sampling/sampling_logp_difference/max": 0.41068172454833984, + "sampling/sampling_logp_difference/mean": 0.013265905901789665, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 197.734375, + "completions/mean_terminated_length": 197.734375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4702549874782562, + "epoch": 2.501225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9499653475974037, + "kl": 0.07387121766805649, + "learning_rate": 8.330299703752497e-08, + "loss": 0.0165, + "num_tokens": 64434496.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5756784677505493, + "sampling/importance_sampling_ratio/mean": 1.0000019073486328, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4679851531982422, + "sampling/sampling_logp_difference/mean": 0.016622673720121384, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 181.453125, + "completions/mean_terminated_length": 181.453125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.41390663385391235, + "epoch": 2.502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8451944335942622, + "kl": 0.05394618213176727, + "learning_rate": 8.290970280524124e-08, + "loss": -0.0286, + "num_tokens": 64460941.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6247601509094238, + "sampling/importance_sampling_ratio/mean": 0.9998599886894226, + "sampling/importance_sampling_ratio/min": 0.5128727555274963, + "sampling/sampling_logp_difference/max": 0.6677275896072388, + "sampling/sampling_logp_difference/mean": 0.016559142619371414, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 148.09375, + "completions/mean_terminated_length": 148.09375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3281470835208893, + "epoch": 2.5036764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018593121964325745, + "kl": 0.02637426368892193, + "learning_rate": 8.251725523983722e-08, + "loss": 0.0003, + "num_tokens": 64485955.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6862460374832153, + "sampling/importance_sampling_ratio/mean": 1.000045657157898, + "sampling/importance_sampling_ratio/min": 0.6827670335769653, + "sampling/sampling_logp_difference/max": 0.5225048065185547, + "sampling/sampling_logp_difference/mean": 0.015443524345755577, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 223.96875, + "completions/mean_terminated_length": 223.96875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4159507155418396, + "epoch": 2.5049019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3564581012158, + "kl": 0.048696912825107574, + "learning_rate": 8.212565513795683e-08, + "loss": -0.0395, + "num_tokens": 64520513.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6581302881240845, + "sampling/importance_sampling_ratio/mean": 1.0005327463150024, + "sampling/importance_sampling_ratio/min": 0.6661763787269592, + "sampling/sampling_logp_difference/max": 0.5056905746459961, + "sampling/sampling_logp_difference/mean": 0.015493962913751602, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 197.84375, + "completions/mean_terminated_length": 197.84375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3104378879070282, + "epoch": 2.506127450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013666259204105519, + "kl": 0.019603494554758072, + "learning_rate": 8.173490329452343e-08, + "loss": 0.0002, + "num_tokens": 64551111.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4720368385314941, + "sampling/importance_sampling_ratio/mean": 0.9996908903121948, + "sampling/importance_sampling_ratio/min": 0.7729811668395996, + "sampling/sampling_logp_difference/max": 0.38664698600769043, + "sampling/sampling_logp_difference/mean": 0.013405261561274529, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 219.296875, + "completions/mean_terminated_length": 219.296875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3943178057670593, + "epoch": 2.5073529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01440045804750558, + "kl": 0.023328255861997604, + "learning_rate": 8.13450005027384e-08, + "loss": 0.0002, + "num_tokens": 64582458.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.629729151725769, + "sampling/importance_sampling_ratio/mean": 0.9995909929275513, + "sampling/importance_sampling_ratio/min": 0.6272247433662415, + "sampling/sampling_logp_difference/max": 0.48841381072998047, + "sampling/sampling_logp_difference/mean": 0.0153773482888937, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 212.671875, + "completions/mean_terminated_length": 212.671875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.36984264850616455, + "epoch": 2.508578431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1698487761148253, + "kl": 0.03360015153884888, + "learning_rate": 8.09559475540797e-08, + "loss": 0.0172, + "num_tokens": 64614261.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7476072311401367, + "sampling/importance_sampling_ratio/mean": 0.9999226927757263, + "sampling/importance_sampling_ratio/min": 0.48961153626441956, + "sampling/sampling_logp_difference/max": 0.7141430377960205, + "sampling/sampling_logp_difference/mean": 0.015418700873851776, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 182.46875, + "completions/mean_terminated_length": 182.46875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.32081127166748047, + "epoch": 2.5098039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017766140029201693, + "kl": 0.023535825312137604, + "learning_rate": 8.056774523830029e-08, + "loss": 0.0002, + "num_tokens": 64639251.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4059449434280396, + "sampling/importance_sampling_ratio/mean": 1.0003461837768555, + "sampling/importance_sampling_ratio/min": 0.6841716766357422, + "sampling/sampling_logp_difference/max": 0.3795464038848877, + "sampling/sampling_logp_difference/mean": 0.013239240273833275, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 195.65625, + "completions/mean_terminated_length": 195.65625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3986157178878784, + "epoch": 2.5110294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015950740997979006, + "kl": 0.02680913172662258, + "learning_rate": 8.018039434342627e-08, + "loss": 0.0003, + "num_tokens": 64669229.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7550904750823975, + "sampling/importance_sampling_ratio/mean": 0.9998959898948669, + "sampling/importance_sampling_ratio/min": 0.6712509393692017, + "sampling/sampling_logp_difference/max": 0.5625203847885132, + "sampling/sampling_logp_difference/mean": 0.01490098051726818, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 207.90625, + "completions/mean_terminated_length": 207.90625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.39417797327041626, + "epoch": 2.5122549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9804432327070892, + "kl": 0.03181571513414383, + "learning_rate": 7.979389565575522e-08, + "loss": -0.0376, + "num_tokens": 64704519.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6269478797912598, + "sampling/importance_sampling_ratio/mean": 0.9998840689659119, + "sampling/importance_sampling_ratio/min": 0.6305050849914551, + "sampling/sampling_logp_difference/max": 0.4867057800292969, + "sampling/sampling_logp_difference/mean": 0.013905135914683342, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 201.8125, + "completions/mean_terminated_length": 201.8125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.39764657616615295, + "epoch": 2.513480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.009551746613855, + "kl": 0.06586883962154388, + "learning_rate": 7.940824995985528e-08, + "loss": 0.0059, + "num_tokens": 64733723.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5073206424713135, + "sampling/importance_sampling_ratio/mean": 1.000825047492981, + "sampling/importance_sampling_ratio/min": 0.6488670706748962, + "sampling/sampling_logp_difference/max": 0.43252742290496826, + "sampling/sampling_logp_difference/mean": 0.01538984663784504, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 199.421875, + "completions/mean_terminated_length": 199.421875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3452928066253662, + "epoch": 2.514705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01763087271025228, + "kl": 0.024976499378681183, + "learning_rate": 7.902345803856264e-08, + "loss": 0.0002, + "num_tokens": 64766246.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.313920259475708, + "sampling/importance_sampling_ratio/mean": 1.0007414817810059, + "sampling/importance_sampling_ratio/min": 0.6838679313659668, + "sampling/sampling_logp_difference/max": 0.37999045848846436, + "sampling/sampling_logp_difference/mean": 0.01332725677639246, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 205.84375, + "completions/mean_terminated_length": 205.84375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.28712159395217896, + "epoch": 2.5159313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012941324844786961, + "kl": 0.014969379641115665, + "learning_rate": 7.863952067298041e-08, + "loss": 0.0001, + "num_tokens": 64797692.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5468515157699585, + "sampling/importance_sampling_ratio/mean": 1.0001466274261475, + "sampling/importance_sampling_ratio/min": 0.5197044610977173, + "sampling/sampling_logp_difference/max": 0.6544950008392334, + "sampling/sampling_logp_difference/mean": 0.012371763586997986, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 208.4375, + "completions/mean_terminated_length": 208.4375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.5194035768508911, + "epoch": 2.517156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8939359137696472, + "kl": 0.06294498592615128, + "learning_rate": 7.825643864247733e-08, + "loss": -0.0189, + "num_tokens": 64834056.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.326941728591919, + "sampling/importance_sampling_ratio/mean": 1.0005096197128296, + "sampling/importance_sampling_ratio/min": 0.7042613625526428, + "sampling/sampling_logp_difference/max": 0.35060572624206543, + "sampling/sampling_logp_difference/mean": 0.017616376280784607, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 181.828125, + "completions/mean_terminated_length": 181.828125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.35135945677757263, + "epoch": 2.5183823529411766, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2998194419705107, + "kl": 0.037417348474264145, + "learning_rate": 7.787421272468547e-08, + "loss": 0.0299, + "num_tokens": 64865501.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4126293659210205, + "sampling/importance_sampling_ratio/mean": 0.9993284344673157, + "sampling/importance_sampling_ratio/min": 0.6157500743865967, + "sampling/sampling_logp_difference/max": 0.48491406440734863, + "sampling/sampling_logp_difference/mean": 0.014803480356931686, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 185.59375, + "completions/mean_terminated_length": 185.59375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.4694395661354065, + "epoch": 2.519607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.944606688946615, + "kl": 0.045927416533231735, + "learning_rate": 7.749284369549952e-08, + "loss": -0.0029, + "num_tokens": 64892723.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3966535329818726, + "sampling/importance_sampling_ratio/mean": 0.9997533559799194, + "sampling/importance_sampling_ratio/min": 0.6684202551841736, + "sampling/sampling_logp_difference/max": 0.40283823013305664, + "sampling/sampling_logp_difference/mean": 0.01650446094572544, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 181.265625, + "completions/mean_terminated_length": 181.265625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.48010629415512085, + "epoch": 2.5208333333333335, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.652227345604709, + "kl": 0.05787282437086105, + "learning_rate": 7.711233232907399e-08, + "loss": 0.0598, + "num_tokens": 64922260.0, + "reward": 0.59375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000523567199707, + "sampling/importance_sampling_ratio/min": 0.6213348507881165, + "sampling/sampling_logp_difference/max": 0.996757984161377, + "sampling/sampling_logp_difference/mean": 0.017763352021574974, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 209.0625, + "completions/mean_terminated_length": 209.0625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4046059250831604, + "epoch": 2.5220588235294117, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2512937563054876, + "kl": 0.03622438386082649, + "learning_rate": 7.673267939782324e-08, + "loss": 0.0188, + "num_tokens": 64957576.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6640219688415527, + "sampling/importance_sampling_ratio/mean": 1.0006648302078247, + "sampling/importance_sampling_ratio/min": 0.690620481967926, + "sampling/sampling_logp_difference/max": 0.50923752784729, + "sampling/sampling_logp_difference/mean": 0.014391078613698483, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 219.25, + "completions/mean_terminated_length": 219.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.33976441621780396, + "epoch": 2.5232843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013465433339745361, + "kl": 0.020324591547250748, + "learning_rate": 7.63538856724184e-08, + "loss": 0.0002, + "num_tokens": 64992968.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7948695421218872, + "sampling/importance_sampling_ratio/mean": 1.0009374618530273, + "sampling/importance_sampling_ratio/min": 0.5868720412254333, + "sampling/sampling_logp_difference/max": 0.5849323272705078, + "sampling/sampling_logp_difference/mean": 0.014407450333237648, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 200.375, + "completions/mean_terminated_length": 200.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.40805691480636597, + "epoch": 2.5245098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8261104427261089, + "kl": 0.028869224712252617, + "learning_rate": 7.597595192178702e-08, + "loss": -0.0153, + "num_tokens": 65022128.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5972583293914795, + "sampling/importance_sampling_ratio/mean": 0.9998906254768372, + "sampling/importance_sampling_ratio/min": 0.6615021228790283, + "sampling/sampling_logp_difference/max": 0.4682886600494385, + "sampling/sampling_logp_difference/mean": 0.015856822952628136, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 181.359375, + "completions/mean_terminated_length": 181.359375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.37535834312438965, + "epoch": 2.525735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0106305715428294, + "kl": 0.054244644939899445, + "learning_rate": 7.559887891311046e-08, + "loss": 0.0058, + "num_tokens": 65049015.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.562191367149353, + "sampling/importance_sampling_ratio/mean": 0.9995304346084595, + "sampling/importance_sampling_ratio/min": 0.6192928552627563, + "sampling/sampling_logp_difference/max": 0.47917699813842773, + "sampling/sampling_logp_difference/mean": 0.01474264170974493, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 190.734375, + "completions/mean_terminated_length": 190.734375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4425475597381592, + "epoch": 2.5269607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0163932395417137, + "kl": 0.02658260613679886, + "learning_rate": 7.522266741182303e-08, + "loss": 0.0202, + "num_tokens": 65085830.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6160290241241455, + "sampling/importance_sampling_ratio/mean": 0.9997236132621765, + "sampling/importance_sampling_ratio/min": 0.6990983486175537, + "sampling/sampling_logp_difference/max": 0.47997188568115234, + "sampling/sampling_logp_difference/mean": 0.015597082674503326, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 211.28125, + "completions/mean_terminated_length": 211.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4401535391807556, + "epoch": 2.528186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8937973539372797, + "kl": 0.061278894543647766, + "learning_rate": 7.484731818161049e-08, + "loss": 0.0284, + "num_tokens": 65113880.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.822419285774231, + "sampling/importance_sampling_ratio/mean": 0.9997953772544861, + "sampling/importance_sampling_ratio/min": 0.4811238646507263, + "sampling/sampling_logp_difference/max": 0.7316305637359619, + "sampling/sampling_logp_difference/mean": 0.015561332926154137, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 159.34375, + "completions/mean_terminated_length": 159.34375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.4249591827392578, + "epoch": 2.5294117647058822, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0349410570256068, + "kl": 0.04541175812482834, + "learning_rate": 7.447283198440763e-08, + "loss": -0.0051, + "num_tokens": 65138686.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.3719853162765503, + "sampling/importance_sampling_ratio/mean": 0.9995375871658325, + "sampling/importance_sampling_ratio/min": 0.48112422227859497, + "sampling/sampling_logp_difference/max": 0.7316298484802246, + "sampling/sampling_logp_difference/mean": 0.016625817865133286, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 214.25, + "completions/mean_terminated_length": 214.25, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.3495437502861023, + "epoch": 2.530637254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.077871423935281, + "kl": 0.02568657323718071, + "learning_rate": 7.409920958039794e-08, + "loss": 0.012, + "num_tokens": 65177006.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.609712839126587, + "sampling/importance_sampling_ratio/mean": 0.9997337460517883, + "sampling/importance_sampling_ratio/min": 0.6248358488082886, + "sampling/sampling_logp_difference/max": 0.4760558605194092, + "sampling/sampling_logp_difference/mean": 0.014553414657711983, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 185.546875, + "completions/mean_terminated_length": 185.546875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.36232730746269226, + "epoch": 2.531862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01971203032382966, + "kl": 0.037130288779735565, + "learning_rate": 7.372645172801112e-08, + "loss": 0.0004, + "num_tokens": 65205921.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4028819799423218, + "sampling/importance_sampling_ratio/mean": 0.9995936155319214, + "sampling/importance_sampling_ratio/min": 0.6272878050804138, + "sampling/sampling_logp_difference/max": 0.46634984016418457, + "sampling/sampling_logp_difference/mean": 0.014208411797881126, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 201.359375, + "completions/mean_terminated_length": 201.359375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.43340033292770386, + "epoch": 2.5330882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8205182366007592, + "kl": 0.03927885368466377, + "learning_rate": 7.335455918392219e-08, + "loss": 0.0102, + "num_tokens": 65239512.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.379061222076416, + "sampling/importance_sampling_ratio/mean": 1.0003657341003418, + "sampling/importance_sampling_ratio/min": 0.6418119072914124, + "sampling/sampling_logp_difference/max": 0.44345998764038086, + "sampling/sampling_logp_difference/mean": 0.01631690375506878, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 206.796875, + "completions/mean_terminated_length": 206.796875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3557499051094055, + "epoch": 2.534313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01973796411473359, + "kl": 0.025097548961639404, + "learning_rate": 7.29835327030493e-08, + "loss": 0.0002, + "num_tokens": 65267323.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5297231674194336, + "sampling/importance_sampling_ratio/mean": 1.0004937648773193, + "sampling/importance_sampling_ratio/min": 0.6882672309875488, + "sampling/sampling_logp_difference/max": 0.42508673667907715, + "sampling/sampling_logp_difference/mean": 0.014410475268959999, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 210.4375, + "completions/mean_terminated_length": 210.4375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.44263601303100586, + "epoch": 2.5355392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8567327170284087, + "kl": 0.033141568303108215, + "learning_rate": 7.261337303855258e-08, + "loss": 0.0189, + "num_tokens": 65300071.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4919159412384033, + "sampling/importance_sampling_ratio/mean": 1.0003559589385986, + "sampling/importance_sampling_ratio/min": 0.7112941741943359, + "sampling/sampling_logp_difference/max": 0.40006113052368164, + "sampling/sampling_logp_difference/mean": 0.017117420211434364, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 201.015625, + "completions/mean_terminated_length": 201.015625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4219868779182434, + "epoch": 2.536764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1735720986413527, + "kl": 0.04875606298446655, + "learning_rate": 7.224408094183299e-08, + "loss": 0.0306, + "num_tokens": 65327272.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4342132806777954, + "sampling/importance_sampling_ratio/mean": 1.0001598596572876, + "sampling/importance_sampling_ratio/min": 0.6172976493835449, + "sampling/sampling_logp_difference/max": 0.4824039936065674, + "sampling/sampling_logp_difference/mean": 0.016299307346343994, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 206.828125, + "completions/mean_terminated_length": 206.828125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.41150209307670593, + "epoch": 2.5379901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.22991337523865, + "kl": 0.04031134024262428, + "learning_rate": 7.187565716252991e-08, + "loss": -0.0319, + "num_tokens": 65355677.0, + "reward": 0.3125, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.44028902053833, + "sampling/importance_sampling_ratio/mean": 0.9997075796127319, + "sampling/importance_sampling_ratio/min": 0.654695451259613, + "sampling/sampling_logp_difference/max": 0.42358505725860596, + "sampling/sampling_logp_difference/mean": 0.014406203292310238, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 197.109375, + "completions/mean_terminated_length": 197.109375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.37234973907470703, + "epoch": 2.5392156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01896846344675893, + "kl": 0.02687685564160347, + "learning_rate": 7.150810244852035e-08, + "loss": 0.0002, + "num_tokens": 65384580.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6081585884094238, + "sampling/importance_sampling_ratio/mean": 1.000108242034912, + "sampling/importance_sampling_ratio/min": 0.6136998534202576, + "sampling/sampling_logp_difference/max": 0.4882493019104004, + "sampling/sampling_logp_difference/mean": 0.015685098245739937, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 182.28125, + "completions/mean_terminated_length": 182.28125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.40313535928726196, + "epoch": 2.5404411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016768388250676957, + "kl": 0.02673422172665596, + "learning_rate": 7.114141754591691e-08, + "loss": 0.0003, + "num_tokens": 65415206.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.551025390625, + "sampling/importance_sampling_ratio/mean": 0.9990107417106628, + "sampling/importance_sampling_ratio/min": 0.6119214296340942, + "sampling/sampling_logp_difference/max": 0.4911513328552246, + "sampling/sampling_logp_difference/mean": 0.015913192182779312, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 173.953125, + "completions/mean_terminated_length": 173.953125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3046499490737915, + "epoch": 2.5416666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01361368443527799, + "kl": 0.020227601751685143, + "learning_rate": 7.077560319906694e-08, + "loss": 0.0002, + "num_tokens": 65443491.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.403162956237793, + "sampling/importance_sampling_ratio/mean": 0.9997733235359192, + "sampling/importance_sampling_ratio/min": 0.5613439679145813, + "sampling/sampling_logp_difference/max": 0.5774214267730713, + "sampling/sampling_logp_difference/mean": 0.012156651355326176, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 230.625, + "completions/mean_terminated_length": 230.625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.30056145787239075, + "epoch": 2.542892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013041721244658643, + "kl": 0.02197718247771263, + "learning_rate": 7.041066015055036e-08, + "loss": 0.0002, + "num_tokens": 65477979.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4096847772598267, + "sampling/importance_sampling_ratio/mean": 1.000238299369812, + "sampling/importance_sampling_ratio/min": 0.49773162603378296, + "sampling/sampling_logp_difference/max": 0.6976943016052246, + "sampling/sampling_logp_difference/mean": 0.012997021898627281, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 185.5, + "completions/mean_terminated_length": 185.5, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3763963282108307, + "epoch": 2.5441176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9779476142011609, + "kl": 0.03305581212043762, + "learning_rate": 7.004658914117822e-08, + "loss": 0.0217, + "num_tokens": 65506347.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6296724081039429, + "sampling/importance_sampling_ratio/mean": 1.0001276731491089, + "sampling/importance_sampling_ratio/min": 0.6284497380256653, + "sampling/sampling_logp_difference/max": 0.48837900161743164, + "sampling/sampling_logp_difference/mean": 0.01456437073647976, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 168.609375, + "completions/mean_terminated_length": 168.609375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3052729368209839, + "epoch": 2.545343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015143259848826397, + "kl": 0.02452005073428154, + "learning_rate": 6.968339090999186e-08, + "loss": 0.0002, + "num_tokens": 65536002.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5346802473068237, + "sampling/importance_sampling_ratio/mean": 1.0001373291015625, + "sampling/importance_sampling_ratio/min": 0.6947748064994812, + "sampling/sampling_logp_difference/max": 0.4283220171928406, + "sampling/sampling_logp_difference/mean": 0.01278787013143301, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 210.0625, + "completions/mean_terminated_length": 210.0625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4861215651035309, + "epoch": 2.5465686274509802, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8034146107780947, + "kl": 0.05419921875, + "learning_rate": 6.932106619426064e-08, + "loss": 0.0064, + "num_tokens": 65569430.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.685737133026123, + "sampling/importance_sampling_ratio/mean": 0.9997113943099976, + "sampling/importance_sampling_ratio/min": 0.7005658745765686, + "sampling/sampling_logp_difference/max": 0.5222029685974121, + "sampling/sampling_logp_difference/mean": 0.01634575054049492, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 165.375, + "completions/mean_terminated_length": 165.375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.35584941506385803, + "epoch": 2.547794117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0645978537399938, + "kl": 0.04145277291536331, + "learning_rate": 6.895961572948067e-08, + "loss": 0.0041, + "num_tokens": 65597182.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5775846242904663, + "sampling/importance_sampling_ratio/mean": 0.9998557567596436, + "sampling/importance_sampling_ratio/min": 0.6304898858070374, + "sampling/sampling_logp_difference/max": 0.4612581729888916, + "sampling/sampling_logp_difference/mean": 0.015014015138149261, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 199.40625, + "completions/mean_terminated_length": 199.40625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.39090844988822937, + "epoch": 2.549019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016649257406454135, + "kl": 0.028168104588985443, + "learning_rate": 6.859904024937347e-08, + "loss": 0.0003, + "num_tokens": 65627784.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4348572492599487, + "sampling/importance_sampling_ratio/mean": 1.0003767013549805, + "sampling/importance_sampling_ratio/min": 0.6622411012649536, + "sampling/sampling_logp_difference/max": 0.4121255874633789, + "sampling/sampling_logp_difference/mean": 0.014203079044818878, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 219.671875, + "completions/mean_terminated_length": 219.671875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.47287118434906006, + "epoch": 2.5502450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8151743040471998, + "kl": 0.06747891008853912, + "learning_rate": 6.823934048588459e-08, + "loss": 0.0163, + "num_tokens": 65658275.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.600164771080017, + "sampling/importance_sampling_ratio/mean": 1.0001966953277588, + "sampling/importance_sampling_ratio/min": 0.7139780521392822, + "sampling/sampling_logp_difference/max": 0.4701066017150879, + "sampling/sampling_logp_difference/mean": 0.0164007730782032, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 253.5, + "completions/mean_terminated_length": 253.5, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.5424678325653076, + "epoch": 2.5514705882352944, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3165769753311134, + "kl": 0.048580899834632874, + "learning_rate": 6.78805171691817e-08, + "loss": -0.0135, + "num_tokens": 65695171.0, + "reward": 0.53125, + "reward_std": 0.6331988573074341, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5378328561782837, + "sampling/importance_sampling_ratio/mean": 0.9999392628669739, + "sampling/importance_sampling_ratio/min": 0.6459082365036011, + "sampling/sampling_logp_difference/max": 0.4370979070663452, + "sampling/sampling_logp_difference/mean": 0.01762511022388935, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 195.703125, + "completions/mean_terminated_length": 195.703125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.4645097851753235, + "epoch": 2.5526960784313726, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5832438382196996, + "kl": 0.06714235246181488, + "learning_rate": 6.752257102765324e-08, + "loss": 0.0134, + "num_tokens": 65733312.0, + "reward": 0.4375, + "reward_std": 0.6311737298965454, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.7630969285964966, + "sampling/importance_sampling_ratio/mean": 1.0013240575790405, + "sampling/importance_sampling_ratio/min": 0.5906386971473694, + "sampling/sampling_logp_difference/max": 0.5670719146728516, + "sampling/sampling_logp_difference/mean": 0.016704872250556946, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 249.953125, + "completions/mean_terminated_length": 249.953125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.4421851634979248, + "epoch": 2.553921568627451, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0813640317510225, + "kl": 0.03265643119812012, + "learning_rate": 6.716550278790739e-08, + "loss": -0.0087, + "num_tokens": 65770685.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5154715776443481, + "sampling/importance_sampling_ratio/mean": 1.0001493692398071, + "sampling/importance_sampling_ratio/min": 0.6494601964950562, + "sampling/sampling_logp_difference/max": 0.4316136837005615, + "sampling/sampling_logp_difference/mean": 0.015306571498513222, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 188.890625, + "completions/mean_terminated_length": 188.890625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3632579743862152, + "epoch": 2.5551470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8485382711143361, + "kl": 0.05607233941555023, + "learning_rate": 6.680931317476996e-08, + "loss": -0.01, + "num_tokens": 65797350.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3961634635925293, + "sampling/importance_sampling_ratio/mean": 1.0000603199005127, + "sampling/importance_sampling_ratio/min": 0.6622359156608582, + "sampling/sampling_logp_difference/max": 0.41213345527648926, + "sampling/sampling_logp_difference/mean": 0.013949640095233917, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 190.140625, + "completions/mean_terminated_length": 190.140625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.5942714214324951, + "epoch": 2.556372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.907817282126184, + "kl": 0.07968472689390182, + "learning_rate": 6.645400291128356e-08, + "loss": -0.0028, + "num_tokens": 65834143.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4339807033538818, + "sampling/importance_sampling_ratio/mean": 0.9993922710418701, + "sampling/importance_sampling_ratio/min": 0.61481773853302, + "sampling/sampling_logp_difference/max": 0.48642945289611816, + "sampling/sampling_logp_difference/mean": 0.019588496536016464, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 238.734375, + "completions/mean_terminated_length": 238.734375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.48568403720855713, + "epoch": 2.5575980392156863, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9744634120275073, + "kl": 0.0538434162735939, + "learning_rate": 6.609957271870503e-08, + "loss": 0.0087, + "num_tokens": 65869518.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5924615859985352, + "sampling/importance_sampling_ratio/mean": 1.000096082687378, + "sampling/importance_sampling_ratio/min": 0.6778700351715088, + "sampling/sampling_logp_difference/max": 0.46528100967407227, + "sampling/sampling_logp_difference/mean": 0.016062160953879356, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 197.296875, + "completions/mean_terminated_length": 197.296875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3151473104953766, + "epoch": 2.5588235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018567241166384255, + "kl": 0.023612767457962036, + "learning_rate": 6.574602331650559e-08, + "loss": 0.0002, + "num_tokens": 65898369.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5695054531097412, + "sampling/importance_sampling_ratio/mean": 0.9994983673095703, + "sampling/importance_sampling_ratio/min": 0.7195582985877991, + "sampling/sampling_logp_difference/max": 0.4507606029510498, + "sampling/sampling_logp_difference/mean": 0.01402480062097311, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 197.34375, + "completions/mean_terminated_length": 197.34375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.445318341255188, + "epoch": 2.560049019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2179666722930946, + "kl": 0.03929883986711502, + "learning_rate": 6.539335542236802e-08, + "loss": 0.0111, + "num_tokens": 65929911.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5594457387924194, + "sampling/importance_sampling_ratio/mean": 1.000316858291626, + "sampling/importance_sampling_ratio/min": 0.6546961069107056, + "sampling/sampling_logp_difference/max": 0.44433045387268066, + "sampling/sampling_logp_difference/mean": 0.01643485575914383, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 180.421875, + "completions/mean_terminated_length": 180.421875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.31945115327835083, + "epoch": 2.561274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01948292048956947, + "kl": 0.026923775672912598, + "learning_rate": 6.504156975218567e-08, + "loss": 0.0002, + "num_tokens": 65955698.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4169365167617798, + "sampling/importance_sampling_ratio/mean": 0.9995955228805542, + "sampling/importance_sampling_ratio/min": 0.6281131505966187, + "sampling/sampling_logp_difference/max": 0.46503496170043945, + "sampling/sampling_logp_difference/mean": 0.014066488482058048, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 257.484375, + "completions/mean_terminated_length": 257.484375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4186283051967621, + "epoch": 2.5625, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9985697505810378, + "kl": 0.025043699890375137, + "learning_rate": 6.469066702006137e-08, + "loss": 0.0695, + "num_tokens": 65988593.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5223242044448853, + "sampling/importance_sampling_ratio/mean": 0.9997868537902832, + "sampling/importance_sampling_ratio/min": 0.6325936317443848, + "sampling/sampling_logp_difference/max": 0.4579271078109741, + "sampling/sampling_logp_difference/mean": 0.016202857717871666, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 215.578125, + "completions/mean_terminated_length": 215.578125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.40918493270874023, + "epoch": 2.563725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02424830290448722, + "kl": 0.03615477308630943, + "learning_rate": 6.43406479383053e-08, + "loss": 0.0004, + "num_tokens": 66017734.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4315046072006226, + "sampling/importance_sampling_ratio/mean": 0.9997341632843018, + "sampling/importance_sampling_ratio/min": 0.6621752977371216, + "sampling/sampling_logp_difference/max": 0.41222500801086426, + "sampling/sampling_logp_difference/mean": 0.014971710741519928, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 135.828125, + "completions/mean_terminated_length": 135.828125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.2993594706058502, + "epoch": 2.564950980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027237160966415087, + "kl": 0.0391191691160202, + "learning_rate": 6.399151321743423e-08, + "loss": 0.0004, + "num_tokens": 66037691.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.480431079864502, + "sampling/importance_sampling_ratio/mean": 1.000072956085205, + "sampling/importance_sampling_ratio/min": 0.661615252494812, + "sampling/sampling_logp_difference/max": 0.4130711555480957, + "sampling/sampling_logp_difference/mean": 0.014084149152040482, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 221.265625, + "completions/mean_terminated_length": 221.265625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3643258213996887, + "epoch": 2.5661764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011489225833486107, + "kl": 0.021840862929821014, + "learning_rate": 6.364326356616917e-08, + "loss": 0.0002, + "num_tokens": 66077052.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6226626634597778, + "sampling/importance_sampling_ratio/mean": 0.9994646906852722, + "sampling/importance_sampling_ratio/min": 0.6210864782333374, + "sampling/sampling_logp_difference/max": 0.4840683937072754, + "sampling/sampling_logp_difference/mean": 0.014700526371598244, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 188.046875, + "completions/mean_terminated_length": 188.046875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.33661404252052307, + "epoch": 2.5674019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014335949900874318, + "kl": 0.022317927330732346, + "learning_rate": 6.329589969143517e-08, + "loss": 0.0002, + "num_tokens": 66107103.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.354552984237671, + "sampling/importance_sampling_ratio/mean": 0.9997498989105225, + "sampling/importance_sampling_ratio/min": 0.6162945628166199, + "sampling/sampling_logp_difference/max": 0.48403024673461914, + "sampling/sampling_logp_difference/mean": 0.013629063963890076, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 211.28125, + "completions/mean_terminated_length": 211.28125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.4470069706439972, + "epoch": 2.568627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.75070318927903, + "kl": 0.05030011385679245, + "learning_rate": 6.29494222983587e-08, + "loss": 0.0328, + "num_tokens": 66145953.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4384031295776367, + "sampling/importance_sampling_ratio/mean": 0.999783456325531, + "sampling/importance_sampling_ratio/min": 0.6539222002029419, + "sampling/sampling_logp_difference/max": 0.4247668981552124, + "sampling/sampling_logp_difference/mean": 0.015024135820567608, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 206.34375, + "completions/mean_terminated_length": 206.34375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.4453170895576477, + "epoch": 2.5698529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9820557513300701, + "kl": 0.04464089497923851, + "learning_rate": 6.260383209026704e-08, + "loss": 0.0161, + "num_tokens": 66179607.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5414364337921143, + "sampling/importance_sampling_ratio/mean": 0.9994871616363525, + "sampling/importance_sampling_ratio/min": 0.6254764199256897, + "sampling/sampling_logp_difference/max": 0.4692416191101074, + "sampling/sampling_logp_difference/mean": 0.016002651304006577, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 227.75, + "completions/mean_terminated_length": 227.75, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4869428277015686, + "epoch": 2.571078431372549, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.2831465130654096, + "kl": 0.08460487425327301, + "learning_rate": 6.225912976868636e-08, + "loss": 0.017, + "num_tokens": 66214071.0, + "reward": 0.21875, + "reward_std": 0.5539814233779907, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5390594005584717, + "sampling/importance_sampling_ratio/mean": 1.0002871751785278, + "sampling/importance_sampling_ratio/min": 0.5630738735198975, + "sampling/sampling_logp_difference/max": 0.5743444561958313, + "sampling/sampling_logp_difference/mean": 0.017172526568174362, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 219.484375, + "completions/mean_terminated_length": 219.484375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.3674548864364624, + "epoch": 2.5723039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021997753395319782, + "kl": 0.03306593745946884, + "learning_rate": 6.191531603334044e-08, + "loss": 0.0003, + "num_tokens": 66243110.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5714753866195679, + "sampling/importance_sampling_ratio/mean": 0.9999969005584717, + "sampling/importance_sampling_ratio/min": 0.5362951159477234, + "sampling/sampling_logp_difference/max": 0.6230707168579102, + "sampling/sampling_logp_difference/mean": 0.014687325805425644, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 211.828125, + "completions/mean_terminated_length": 211.828125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.30457037687301636, + "epoch": 2.5735294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012549578551228192, + "kl": 0.01527449395507574, + "learning_rate": 6.157239158214966e-08, + "loss": 0.0002, + "num_tokens": 66279771.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5515893697738647, + "sampling/importance_sampling_ratio/mean": 0.9998092651367188, + "sampling/importance_sampling_ratio/min": 0.6406391859054565, + "sampling/sampling_logp_difference/max": 0.44528889656066895, + "sampling/sampling_logp_difference/mean": 0.012442250736057758, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 172.140625, + "completions/mean_terminated_length": 172.140625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3210659325122833, + "epoch": 2.5747549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023644061330135357, + "kl": 0.02690596878528595, + "learning_rate": 6.123035711122859e-08, + "loss": 0.0003, + "num_tokens": 66308676.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 0.9999759197235107, + "sampling/importance_sampling_ratio/min": 0.6546952724456787, + "sampling/sampling_logp_difference/max": 0.49199533462524414, + "sampling/sampling_logp_difference/mean": 0.014081955887377262, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 200.515625, + "completions/mean_terminated_length": 200.515625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.374914288520813, + "epoch": 2.575980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018392236450731337, + "kl": 0.02459472417831421, + "learning_rate": 6.088921331488566e-08, + "loss": 0.0002, + "num_tokens": 66338325.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.363040804862976, + "sampling/importance_sampling_ratio/mean": 1.000260591506958, + "sampling/importance_sampling_ratio/min": 0.5137227177619934, + "sampling/sampling_logp_difference/max": 0.6660715937614441, + "sampling/sampling_logp_difference/mean": 0.014474395662546158, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 159.71875, + "completions/mean_terminated_length": 159.71875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3740498721599579, + "epoch": 2.577205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9346502239688184, + "kl": 0.028557591140270233, + "learning_rate": 6.05489608856214e-08, + "loss": -0.0112, + "num_tokens": 66365443.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3664571046829224, + "sampling/importance_sampling_ratio/mean": 0.9995359182357788, + "sampling/importance_sampling_ratio/min": 0.6773549318313599, + "sampling/sampling_logp_difference/max": 0.38955986499786377, + "sampling/sampling_logp_difference/mean": 0.014828909188508987, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 222.9375, + "completions/mean_terminated_length": 222.9375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.40943849086761475, + "epoch": 2.5784313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01977636754486806, + "kl": 0.02581152133643627, + "learning_rate": 6.020960051412638e-08, + "loss": 0.0003, + "num_tokens": 66396703.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4477920532226562, + "sampling/importance_sampling_ratio/mean": 1.000298261642456, + "sampling/importance_sampling_ratio/min": 0.5010133385658264, + "sampling/sampling_logp_difference/max": 0.6911225318908691, + "sampling/sampling_logp_difference/mean": 0.01465803012251854, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 211.140625, + "completions/mean_terminated_length": 211.140625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3674590289592743, + "epoch": 2.579656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01807957295302773, + "kl": 0.019815169274806976, + "learning_rate": 5.98711328892808e-08, + "loss": 0.0002, + "num_tokens": 66429192.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6007360219955444, + "sampling/importance_sampling_ratio/mean": 1.0005848407745361, + "sampling/importance_sampling_ratio/min": 0.6564944982528687, + "sampling/sampling_logp_difference/max": 0.47046351432800293, + "sampling/sampling_logp_difference/mean": 0.014578346163034439, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 193.484375, + "completions/mean_terminated_length": 193.484375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.47036683559417725, + "epoch": 2.5808823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9480830279283758, + "kl": 0.06246185302734375, + "learning_rate": 5.9533558698152355e-08, + "loss": -0.0002, + "num_tokens": 66460087.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4385663270950317, + "sampling/importance_sampling_ratio/mean": 1.0000579357147217, + "sampling/importance_sampling_ratio/min": 0.6575733423233032, + "sampling/sampling_logp_difference/max": 0.41919898986816406, + "sampling/sampling_logp_difference/mean": 0.016523167490959167, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 236.078125, + "completions/mean_terminated_length": 236.078125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.4175148010253906, + "epoch": 2.582107843137255, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9270753704537925, + "kl": 0.04759068042039871, + "learning_rate": 5.919687862599548e-08, + "loss": 0.0208, + "num_tokens": 66494028.0, + "reward": 0.28125, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.4843947887420654, + "sampling/importance_sampling_ratio/mean": 0.9996776580810547, + "sampling/importance_sampling_ratio/min": 0.6776774525642395, + "sampling/sampling_logp_difference/max": 0.3950071334838867, + "sampling/sampling_logp_difference/mean": 0.014096850529313087, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 178.796875, + "completions/mean_terminated_length": 178.796875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3843807578086853, + "epoch": 2.5833333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01602146693048434, + "kl": 0.028620852157473564, + "learning_rate": 5.886109335624928e-08, + "loss": 0.0003, + "num_tokens": 66525407.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.564172387123108, + "sampling/importance_sampling_ratio/mean": 1.0002281665802002, + "sampling/importance_sampling_ratio/min": 0.6718854904174805, + "sampling/sampling_logp_difference/max": 0.4473569393157959, + "sampling/sampling_logp_difference/mean": 0.016141705214977264, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 131.109375, + "completions/mean_terminated_length": 131.109375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.28437256813049316, + "epoch": 2.5845588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02630938554479464, + "kl": 0.028298180550336838, + "learning_rate": 5.8526203570536504e-08, + "loss": 0.0003, + "num_tokens": 66546230.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3972299098968506, + "sampling/importance_sampling_ratio/mean": 0.9994284510612488, + "sampling/importance_sampling_ratio/min": 0.613646924495697, + "sampling/sampling_logp_difference/max": 0.48833560943603516, + "sampling/sampling_logp_difference/mean": 0.014699292369186878, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 155.953125, + "completions/mean_terminated_length": 155.953125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.3374735713005066, + "epoch": 2.5857843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02280321129490057, + "kl": 0.04040870815515518, + "learning_rate": 5.819220994866236e-08, + "loss": 0.0004, + "num_tokens": 66571299.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4676449298858643, + "sampling/importance_sampling_ratio/mean": 0.9996737837791443, + "sampling/importance_sampling_ratio/min": 0.6067817211151123, + "sampling/sampling_logp_difference/max": 0.4995861053466797, + "sampling/sampling_logp_difference/mean": 0.013758618384599686, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 168.734375, + "completions/mean_terminated_length": 168.734375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3715071678161621, + "epoch": 2.5870098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01729749422561657, + "kl": 0.03268400579690933, + "learning_rate": 5.7859113168612696e-08, + "loss": 0.0003, + "num_tokens": 66600466.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4979567527770996, + "sampling/importance_sampling_ratio/mean": 1.0001771450042725, + "sampling/importance_sampling_ratio/min": 0.6424275040626526, + "sampling/sampling_logp_difference/max": 0.4425013065338135, + "sampling/sampling_logp_difference/mean": 0.015010682865977287, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 205.0, + "completions/mean_terminated_length": 205.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.41879019141197205, + "epoch": 2.588235294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.794542944353632, + "kl": 0.03571142628788948, + "learning_rate": 5.7526913906552786e-08, + "loss": 0.0208, + "num_tokens": 66639106.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5448921918869019, + "sampling/importance_sampling_ratio/mean": 1.0000910758972168, + "sampling/importance_sampling_ratio/min": 0.6546945571899414, + "sampling/sampling_logp_difference/max": 0.4349541664123535, + "sampling/sampling_logp_difference/mean": 0.0153944818302989, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 237.6875, + "completions/mean_terminated_length": 237.6875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.36095866560935974, + "epoch": 2.5894607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.83986554242995, + "kl": 0.03923949971795082, + "learning_rate": 5.7195612836826055e-08, + "loss": 0.0041, + "num_tokens": 66671694.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.3939363956451416, + "sampling/importance_sampling_ratio/mean": 0.9999719858169556, + "sampling/importance_sampling_ratio/min": 0.5363647937774658, + "sampling/sampling_logp_difference/max": 0.6229407787322998, + "sampling/sampling_logp_difference/mean": 0.013769697397947311, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 208.328125, + "completions/mean_terminated_length": 208.328125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.41904306411743164, + "epoch": 2.590686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9101347229153239, + "kl": 0.028980256989598274, + "learning_rate": 5.686521063195287e-08, + "loss": 0.0194, + "num_tokens": 66703491.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4888678789138794, + "sampling/importance_sampling_ratio/mean": 0.9995335936546326, + "sampling/importance_sampling_ratio/min": 0.682695209980011, + "sampling/sampling_logp_difference/max": 0.39801597595214844, + "sampling/sampling_logp_difference/mean": 0.0153053505346179, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 220.625, + "completions/mean_terminated_length": 220.625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3330810070037842, + "epoch": 2.5919117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012560079172912738, + "kl": 0.019664783030748367, + "learning_rate": 5.6535707962628685e-08, + "loss": 0.0002, + "num_tokens": 66741099.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4760103225708008, + "sampling/importance_sampling_ratio/mean": 1.000353455543518, + "sampling/importance_sampling_ratio/min": 0.376099169254303, + "sampling/sampling_logp_difference/max": 0.9779024124145508, + "sampling/sampling_logp_difference/mean": 0.012436985038220882, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 182.953125, + "completions/mean_terminated_length": 182.953125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.5205093026161194, + "epoch": 2.593137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9135108534463635, + "kl": 0.04284369945526123, + "learning_rate": 5.620710549772295e-08, + "loss": -0.0014, + "num_tokens": 66775832.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4819895029067993, + "sampling/importance_sampling_ratio/mean": 1.0004993677139282, + "sampling/importance_sampling_ratio/min": 0.720510721206665, + "sampling/sampling_logp_difference/max": 0.3933854103088379, + "sampling/sampling_logp_difference/mean": 0.017911650240421295, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 182.296875, + "completions/mean_terminated_length": 182.296875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.362684965133667, + "epoch": 2.594362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019456104270723005, + "kl": 0.028201991692185402, + "learning_rate": 5.5879403904278034e-08, + "loss": 0.0003, + "num_tokens": 66802315.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4301707744598389, + "sampling/importance_sampling_ratio/mean": 0.9996477365493774, + "sampling/importance_sampling_ratio/min": 0.5825095772743225, + "sampling/sampling_logp_difference/max": 0.5404096841812134, + "sampling/sampling_logp_difference/mean": 0.015181276947259903, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 194.265625, + "completions/mean_terminated_length": 194.265625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.4143337309360504, + "epoch": 2.5955882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2345566335019353, + "kl": 0.07903826236724854, + "learning_rate": 5.555260384750721e-08, + "loss": 0.0075, + "num_tokens": 66830556.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4668275117874146, + "sampling/importance_sampling_ratio/mean": 0.9987766742706299, + "sampling/importance_sampling_ratio/min": 0.6479809880256653, + "sampling/sampling_logp_difference/max": 0.43389391899108887, + "sampling/sampling_logp_difference/mean": 0.01558046042919159, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 209.484375, + "completions/mean_terminated_length": 209.484375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.36364811658859253, + "epoch": 2.596813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013713065630409327, + "kl": 0.021273497492074966, + "learning_rate": 5.5226705990794156e-08, + "loss": 0.0002, + "num_tokens": 66868315.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6138299703598022, + "sampling/importance_sampling_ratio/mean": 0.9999983310699463, + "sampling/importance_sampling_ratio/min": 0.6207950115203857, + "sampling/sampling_logp_difference/max": 0.4786102771759033, + "sampling/sampling_logp_difference/mean": 0.01500872615724802, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 261.0, + "completions/mean_terminated_length": 261.0, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.4230864346027374, + "epoch": 2.5980392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8143844606419269, + "kl": 0.01911357045173645, + "learning_rate": 5.4901710995690576e-08, + "loss": -0.0274, + "num_tokens": 66903019.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6599781513214111, + "sampling/importance_sampling_ratio/mean": 1.0002446174621582, + "sampling/importance_sampling_ratio/min": 0.500320315361023, + "sampling/sampling_logp_difference/max": 0.6925067901611328, + "sampling/sampling_logp_difference/mean": 0.014856458641588688, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 202.1875, + "completions/mean_terminated_length": 202.1875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.4207639992237091, + "epoch": 2.599264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03326433934880384, + "kl": 0.05689454451203346, + "learning_rate": 5.4577619521915916e-08, + "loss": 0.0006, + "num_tokens": 66934471.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5942623615264893, + "sampling/importance_sampling_ratio/mean": 1.0002871751785278, + "sampling/importance_sampling_ratio/min": 0.6772719621658325, + "sampling/sampling_logp_difference/max": 0.46641111373901367, + "sampling/sampling_logp_difference/mean": 0.01560080423951149, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 222.953125, + "completions/mean_terminated_length": 222.953125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.33361703157424927, + "epoch": 2.6004901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8840086549830525, + "kl": 0.024904098361730576, + "learning_rate": 5.425443222735526e-08, + "loss": -0.0015, + "num_tokens": 66964452.0, + "reward": -0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4255083799362183, + "sampling/importance_sampling_ratio/mean": 1.0000474452972412, + "sampling/importance_sampling_ratio/min": 0.7039909362792969, + "sampling/sampling_logp_difference/max": 0.35452842712402344, + "sampling/sampling_logp_difference/mean": 0.012197159230709076, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 253.734375, + "completions/mean_terminated_length": 253.734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.5072479844093323, + "epoch": 2.6017156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6787254225108151, + "kl": 0.052853070199489594, + "learning_rate": 5.393214976805832e-08, + "loss": -0.0094, + "num_tokens": 67003027.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5244715213775635, + "sampling/importance_sampling_ratio/mean": 1.0005197525024414, + "sampling/importance_sampling_ratio/min": 0.6650070548057556, + "sampling/sampling_logp_difference/max": 0.4216477870941162, + "sampling/sampling_logp_difference/mean": 0.016826054081320763, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 178.234375, + "completions/mean_terminated_length": 178.234375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.36420899629592896, + "epoch": 2.6029411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03384692700289003, + "kl": 0.032734621316194534, + "learning_rate": 5.361077279823817e-08, + "loss": 0.0003, + "num_tokens": 67031010.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6157547235488892, + "sampling/importance_sampling_ratio/mean": 0.9999369978904724, + "sampling/importance_sampling_ratio/min": 0.6351216435432434, + "sampling/sampling_logp_difference/max": 0.47980213165283203, + "sampling/sampling_logp_difference/mean": 0.01496285479515791, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 237.203125, + "completions/mean_terminated_length": 237.203125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.38095271587371826, + "epoch": 2.6041666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6323050600572573, + "kl": 0.03123561292886734, + "learning_rate": 5.3290301970269514e-08, + "loss": 0.0046, + "num_tokens": 67063119.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7322168350219727, + "sampling/importance_sampling_ratio/mean": 1.0004644393920898, + "sampling/importance_sampling_ratio/min": 0.6108182072639465, + "sampling/sampling_logp_difference/max": 0.5494019985198975, + "sampling/sampling_logp_difference/mean": 0.01421417761594057, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 158.78125, + "completions/mean_terminated_length": 158.78125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3642742335796356, + "epoch": 2.605392156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034789694831971765, + "kl": 0.05975145846605301, + "learning_rate": 5.29707379346882e-08, + "loss": 0.0006, + "num_tokens": 67088753.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6307361125946045, + "sampling/importance_sampling_ratio/mean": 1.0000109672546387, + "sampling/importance_sampling_ratio/min": 0.6289353966712952, + "sampling/sampling_logp_difference/max": 0.4890315532684326, + "sampling/sampling_logp_difference/mean": 0.015447361394762993, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 227.78125, + "completions/mean_terminated_length": 227.78125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.36701464653015137, + "epoch": 2.6066176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7879647133880849, + "kl": 0.022777989506721497, + "learning_rate": 5.2652081340188506e-08, + "loss": -0.0569, + "num_tokens": 67122307.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997912049293518, + "sampling/importance_sampling_ratio/min": 0.6298379898071289, + "sampling/sampling_logp_difference/max": 0.8105654716491699, + "sampling/sampling_logp_difference/mean": 0.014233799651265144, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 204.25, + "completions/mean_terminated_length": 204.25, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3423210680484772, + "epoch": 2.607843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0162284263371988, + "kl": 0.024179883301258087, + "learning_rate": 5.2334332833623487e-08, + "loss": 0.0002, + "num_tokens": 67155763.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5865659713745117, + "sampling/importance_sampling_ratio/mean": 1.000349998474121, + "sampling/importance_sampling_ratio/min": 0.6727445125579834, + "sampling/sampling_logp_difference/max": 0.46157193183898926, + "sampling/sampling_logp_difference/mean": 0.014300025999546051, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 230.375, + "completions/mean_terminated_length": 230.375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.4718915522098541, + "epoch": 2.6090686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028572234533195292, + "kl": 0.03860338404774666, + "learning_rate": 5.2017493060002196e-08, + "loss": 0.0004, + "num_tokens": 67188203.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3974932432174683, + "sampling/importance_sampling_ratio/mean": 0.9992986917495728, + "sampling/importance_sampling_ratio/min": 0.6844436526298523, + "sampling/sampling_logp_difference/max": 0.3791489601135254, + "sampling/sampling_logp_difference/mean": 0.01681319996714592, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 203.109375, + "completions/mean_terminated_length": 203.109375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.376884400844574, + "epoch": 2.610294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0171315248231306, + "kl": 0.022489607334136963, + "learning_rate": 5.1701562662489596e-08, + "loss": 0.0002, + "num_tokens": 67222898.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4988843202590942, + "sampling/importance_sampling_ratio/mean": 1.000124216079712, + "sampling/importance_sampling_ratio/min": 0.6411409974098206, + "sampling/sampling_logp_difference/max": 0.4445059299468994, + "sampling/sampling_logp_difference/mean": 0.015070099383592606, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 226.421875, + "completions/mean_terminated_length": 226.421875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.43235793709754944, + "epoch": 2.611519607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6793980733597528, + "kl": 0.060259003192186356, + "learning_rate": 5.138654228240424e-08, + "loss": 0.0097, + "num_tokens": 67255853.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.3672351837158203, + "sampling/importance_sampling_ratio/mean": 0.9999115467071533, + "sampling/importance_sampling_ratio/min": 0.5359736084938049, + "sampling/sampling_logp_difference/max": 0.6236703395843506, + "sampling/sampling_logp_difference/mean": 0.015460221096873283, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 155.328125, + "completions/mean_terminated_length": 155.328125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.39062735438346863, + "epoch": 2.6127450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9959490995274431, + "kl": 0.02826238051056862, + "learning_rate": 5.1072432559217446e-08, + "loss": 0.0213, + "num_tokens": 67284562.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2868387699127197, + "sampling/importance_sampling_ratio/mean": 1.000374436378479, + "sampling/importance_sampling_ratio/min": 0.5039337873458862, + "sampling/sampling_logp_difference/max": 0.6853103637695312, + "sampling/sampling_logp_difference/mean": 0.014396356418728828, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 203.28125, + "completions/mean_terminated_length": 203.28125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.4167785048484802, + "epoch": 2.6139705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015378856424382125, + "kl": 0.022126592695713043, + "learning_rate": 5.075923413055222e-08, + "loss": 0.0002, + "num_tokens": 67313892.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5830098390579224, + "sampling/importance_sampling_ratio/mean": 0.9998418092727661, + "sampling/importance_sampling_ratio/min": 0.6997085809707642, + "sampling/sampling_logp_difference/max": 0.45932793617248535, + "sampling/sampling_logp_difference/mean": 0.0156828835606575, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 217.296875, + "completions/mean_terminated_length": 217.296875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3475598096847534, + "epoch": 2.6151960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7135612162620825, + "kl": 0.02928490936756134, + "learning_rate": 5.044694763218149e-08, + "loss": -0.0157, + "num_tokens": 67343959.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4025582075119019, + "sampling/importance_sampling_ratio/mean": 0.9999585151672363, + "sampling/importance_sampling_ratio/min": 0.6419162750244141, + "sampling/sampling_logp_difference/max": 0.4432973861694336, + "sampling/sampling_logp_difference/mean": 0.01262687612324953, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 198.625, + "completions/mean_terminated_length": 198.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3160248398780823, + "epoch": 2.616421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01667655163182362, + "kl": 0.01977652497589588, + "learning_rate": 5.013557369802701e-08, + "loss": 0.0002, + "num_tokens": 67374191.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5577393770217896, + "sampling/importance_sampling_ratio/mean": 1.0001202821731567, + "sampling/importance_sampling_ratio/min": 0.7577202320098877, + "sampling/sampling_logp_difference/max": 0.4432356357574463, + "sampling/sampling_logp_difference/mean": 0.012885408475995064, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 172.453125, + "completions/mean_terminated_length": 172.453125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.39916563034057617, + "epoch": 2.6176470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020251231669469082, + "kl": 0.027808837592601776, + "learning_rate": 4.982511296015807e-08, + "loss": 0.0003, + "num_tokens": 67400700.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8188602924346924, + "sampling/importance_sampling_ratio/mean": 1.000488519668579, + "sampling/importance_sampling_ratio/min": 0.6121949553489685, + "sampling/sampling_logp_difference/max": 0.5982100963592529, + "sampling/sampling_logp_difference/mean": 0.016949813812971115, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 189.359375, + "completions/mean_terminated_length": 189.359375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3966854512691498, + "epoch": 2.618872549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035290026610468254, + "kl": 0.06541240215301514, + "learning_rate": 4.951556604879048e-08, + "loss": 0.0006, + "num_tokens": 67429363.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6362992525100708, + "sampling/importance_sampling_ratio/mean": 0.999947190284729, + "sampling/importance_sampling_ratio/min": 0.7212660908699036, + "sampling/sampling_logp_difference/max": 0.49243712425231934, + "sampling/sampling_logp_difference/mean": 0.015420676209032536, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 211.71875, + "completions/mean_terminated_length": 211.71875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4054481089115143, + "epoch": 2.6200980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7655339023697475, + "kl": 0.05712512135505676, + "learning_rate": 4.9206933592284725e-08, + "loss": -0.0016, + "num_tokens": 67467905.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.439152717590332, + "sampling/importance_sampling_ratio/mean": 0.9993346333503723, + "sampling/importance_sampling_ratio/min": 0.5603407025337219, + "sampling/sampling_logp_difference/max": 0.5792102813720703, + "sampling/sampling_logp_difference/mean": 0.015379207208752632, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 208.359375, + "completions/mean_terminated_length": 208.359375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.38255298137664795, + "epoch": 2.6213235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013470305053340506, + "kl": 0.023093286901712418, + "learning_rate": 4.889921621714516e-08, + "loss": 0.0002, + "num_tokens": 67505992.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.54817533493042, + "sampling/importance_sampling_ratio/mean": 0.9997929334640503, + "sampling/importance_sampling_ratio/min": 0.6157702207565308, + "sampling/sampling_logp_difference/max": 0.4848814010620117, + "sampling/sampling_logp_difference/mean": 0.01380122546106577, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 227.03125, + "completions/mean_terminated_length": 227.03125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.46854493021965027, + "epoch": 2.622549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2791723949659457, + "kl": 0.03231387585401535, + "learning_rate": 4.859241454801866e-08, + "loss": 0.0058, + "num_tokens": 67541402.0, + "reward": 0.4375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.664849042892456, + "sampling/importance_sampling_ratio/mean": 0.9997316598892212, + "sampling/importance_sampling_ratio/min": 0.6409977674484253, + "sampling/sampling_logp_difference/max": 0.5097345113754272, + "sampling/sampling_logp_difference/mean": 0.017020462080836296, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 216.703125, + "completions/mean_terminated_length": 216.703125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.30431610345840454, + "epoch": 2.623774509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016013747254574693, + "kl": 0.023328766226768494, + "learning_rate": 4.828652920769311e-08, + "loss": 0.0002, + "num_tokens": 67573559.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.449704647064209, + "sampling/importance_sampling_ratio/mean": 1.0002050399780273, + "sampling/importance_sampling_ratio/min": 0.7126727104187012, + "sampling/sampling_logp_difference/max": 0.37135982513427734, + "sampling/sampling_logp_difference/mean": 0.011685984209179878, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 167.40625, + "completions/mean_terminated_length": 167.40625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.34901028871536255, + "epoch": 2.625, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0236117528688996, + "kl": 0.025571728125214577, + "learning_rate": 4.7981560817096366e-08, + "loss": 0.0003, + "num_tokens": 67601185.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4754151105880737, + "sampling/importance_sampling_ratio/mean": 1.0006507635116577, + "sampling/importance_sampling_ratio/min": 0.663468599319458, + "sampling/sampling_logp_difference/max": 0.41027379035949707, + "sampling/sampling_logp_difference/mean": 0.01529423613101244, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 160.46875, + "completions/mean_terminated_length": 160.46875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3441200852394104, + "epoch": 2.626225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8936793610125214, + "kl": 0.031151149421930313, + "learning_rate": 4.767750999529485e-08, + "loss": 0.0081, + "num_tokens": 67625711.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6312365531921387, + "sampling/importance_sampling_ratio/mean": 1.0001682043075562, + "sampling/importance_sampling_ratio/min": 0.6543079018592834, + "sampling/sampling_logp_difference/max": 0.48933839797973633, + "sampling/sampling_logp_difference/mean": 0.014745216816663742, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 184.265625, + "completions/mean_terminated_length": 184.265625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.31365156173706055, + "epoch": 2.627450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0837940597102713, + "kl": 0.019635751843452454, + "learning_rate": 4.7374377359492624e-08, + "loss": 0.0229, + "num_tokens": 67655728.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.311972975730896, + "sampling/importance_sampling_ratio/mean": 0.9995397925376892, + "sampling/importance_sampling_ratio/min": 0.38968032598495483, + "sampling/sampling_logp_difference/max": 0.9424285888671875, + "sampling/sampling_logp_difference/mean": 0.01384140457957983, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 187.453125, + "completions/mean_terminated_length": 187.453125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4393818974494934, + "epoch": 2.6286764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018799716519235687, + "kl": 0.025906246155500412, + "learning_rate": 4.707216352502974e-08, + "loss": 0.0003, + "num_tokens": 67684461.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.609341025352478, + "sampling/importance_sampling_ratio/mean": 0.9995216131210327, + "sampling/importance_sampling_ratio/min": 0.6207955479621887, + "sampling/sampling_logp_difference/max": 0.47675347328186035, + "sampling/sampling_logp_difference/mean": 0.01651901751756668, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 250.375, + "completions/mean_terminated_length": 250.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.49239933490753174, + "epoch": 2.6299019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8208077058143795, + "kl": 0.03966863825917244, + "learning_rate": 4.6770869105380914e-08, + "loss": -0.0083, + "num_tokens": 67722757.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4605991840362549, + "sampling/importance_sampling_ratio/mean": 0.9996809363365173, + "sampling/importance_sampling_ratio/min": 0.660297155380249, + "sampling/sampling_logp_difference/max": 0.41506528854370117, + "sampling/sampling_logp_difference/mean": 0.016705110669136047, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 179.546875, + "completions/mean_terminated_length": 179.546875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3138548731803894, + "epoch": 2.631127450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018924036857109135, + "kl": 0.030965154990553856, + "learning_rate": 4.647049471215497e-08, + "loss": 0.0003, + "num_tokens": 67752376.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3848775625228882, + "sampling/importance_sampling_ratio/mean": 0.9996682405471802, + "sampling/importance_sampling_ratio/min": 0.6506453156471252, + "sampling/sampling_logp_difference/max": 0.4297906160354614, + "sampling/sampling_logp_difference/mean": 0.01328134536743164, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 177.0625, + "completions/mean_terminated_length": 177.0625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.38742679357528687, + "epoch": 2.6323529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9040913443069275, + "kl": 0.04941537231206894, + "learning_rate": 4.6171040955092835e-08, + "loss": -0.0027, + "num_tokens": 67779788.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4697128534317017, + "sampling/importance_sampling_ratio/mean": 0.9992687106132507, + "sampling/importance_sampling_ratio/min": 0.5127838850021362, + "sampling/sampling_logp_difference/max": 0.667900800704956, + "sampling/sampling_logp_difference/mean": 0.01535370759665966, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 212.515625, + "completions/mean_terminated_length": 212.515625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3959088623523712, + "epoch": 2.633578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014901367515449515, + "kl": 0.02232431247830391, + "learning_rate": 4.587250844206664e-08, + "loss": 0.0002, + "num_tokens": 67810893.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5291082859039307, + "sampling/importance_sampling_ratio/mean": 1.0004322528839111, + "sampling/importance_sampling_ratio/min": 0.6461935639381409, + "sampling/sampling_logp_difference/max": 0.43665623664855957, + "sampling/sampling_logp_difference/mean": 0.01567588374018669, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 179.296875, + "completions/mean_terminated_length": 179.296875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4003340005874634, + "epoch": 2.6348039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0464033584410158, + "kl": 0.05728421360254288, + "learning_rate": 4.557489777907836e-08, + "loss": 0.0006, + "num_tokens": 67838304.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4168375730514526, + "sampling/importance_sampling_ratio/mean": 0.9998183250427246, + "sampling/importance_sampling_ratio/min": 0.6632786393165588, + "sampling/sampling_logp_difference/max": 0.41056013107299805, + "sampling/sampling_logp_difference/mean": 0.015619423240423203, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 613.0, + "completions/max_terminated_length": 613.0, + "completions/mean_length": 225.5625, + "completions/mean_terminated_length": 225.5625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3121446967124939, + "epoch": 2.6360294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013397615060628754, + "kl": 0.028556915000081062, + "learning_rate": 4.527820957025891e-08, + "loss": 0.0003, + "num_tokens": 67871972.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3516439199447632, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 0.6623451709747314, + "sampling/sampling_logp_difference/max": 0.411968469619751, + "sampling/sampling_logp_difference/mean": 0.011982333846390247, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 174.28125, + "completions/mean_terminated_length": 174.28125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3555576503276825, + "epoch": 2.6372549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9091776511384866, + "kl": 0.029665805399417877, + "learning_rate": 4.498244441786675e-08, + "loss": 0.0073, + "num_tokens": 67899238.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4009780883789062, + "sampling/importance_sampling_ratio/mean": 0.9996954202651978, + "sampling/importance_sampling_ratio/min": 0.6616666316986084, + "sampling/sampling_logp_difference/max": 0.4129934310913086, + "sampling/sampling_logp_difference/mean": 0.015006804838776588, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 201.59375, + "completions/mean_terminated_length": 201.59375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3351283669471741, + "epoch": 2.638480392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9198191991067566, + "kl": 0.037099454551935196, + "learning_rate": 4.4687602922286016e-08, + "loss": 0.0048, + "num_tokens": 67929516.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4150298833847046, + "sampling/importance_sampling_ratio/mean": 0.999969482421875, + "sampling/importance_sampling_ratio/min": 0.6585971713066101, + "sampling/sampling_logp_difference/max": 0.41764330863952637, + "sampling/sampling_logp_difference/mean": 0.01474051084369421, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 637.0, + "completions/max_terminated_length": 637.0, + "completions/mean_length": 236.046875, + "completions/mean_terminated_length": 236.046875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.39587363600730896, + "epoch": 2.639705882352941, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5191392782392061, + "kl": 0.03483106195926666, + "learning_rate": 4.4393685682026505e-08, + "loss": -0.0213, + "num_tokens": 67967247.0, + "reward": 0.03125, + "reward_std": 0.46656501293182373, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5031934976577759, + "sampling/importance_sampling_ratio/mean": 0.9996311664581299, + "sampling/importance_sampling_ratio/min": 0.6101205945014954, + "sampling/sampling_logp_difference/max": 0.4940987229347229, + "sampling/sampling_logp_difference/mean": 0.014731589704751968, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 201.8125, + "completions/mean_terminated_length": 201.8125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.45350176095962524, + "epoch": 2.6409313725490198, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3363351985516962, + "kl": 0.03747811168432236, + "learning_rate": 4.4100693293721516e-08, + "loss": 0.0536, + "num_tokens": 67996051.0, + "reward": 0.3125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5776199102401733, + "sampling/importance_sampling_ratio/mean": 1.0001548528671265, + "sampling/importance_sampling_ratio/min": 0.6348408460617065, + "sampling/sampling_logp_difference/max": 0.4559173583984375, + "sampling/sampling_logp_difference/mean": 0.017571210861206055, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 193.671875, + "completions/mean_terminated_length": 193.671875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.38786581158638, + "epoch": 2.642156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8836828028498205, + "kl": 0.06262829899787903, + "learning_rate": 4.3808626352127066e-08, + "loss": -0.0266, + "num_tokens": 68027454.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6130213737487793, + "sampling/importance_sampling_ratio/mean": 1.0003948211669922, + "sampling/importance_sampling_ratio/min": 0.6953710913658142, + "sampling/sampling_logp_difference/max": 0.47810912132263184, + "sampling/sampling_logp_difference/mean": 0.015348730608820915, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 182.6875, + "completions/mean_terminated_length": 182.6875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4030335545539856, + "epoch": 2.6433823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03937552064487977, + "kl": 0.09337794780731201, + "learning_rate": 4.351748545012057e-08, + "loss": 0.0007, + "num_tokens": 68056154.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3002655506134033, + "sampling/importance_sampling_ratio/mean": 0.9997163414955139, + "sampling/importance_sampling_ratio/min": 0.6874826550483704, + "sampling/sampling_logp_difference/max": 0.37471866607666016, + "sampling/sampling_logp_difference/mean": 0.014955861493945122, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 161.625, + "completions/mean_terminated_length": 161.625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.33162474632263184, + "epoch": 2.644607843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016589471458956482, + "kl": 0.021302416920661926, + "learning_rate": 4.322727117869951e-08, + "loss": 0.0002, + "num_tokens": 68086418.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5277650356292725, + "sampling/importance_sampling_ratio/mean": 0.9993041157722473, + "sampling/importance_sampling_ratio/min": 0.6925998330116272, + "sampling/sampling_logp_difference/max": 0.42380595207214355, + "sampling/sampling_logp_difference/mean": 0.014312541112303734, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 178.734375, + "completions/mean_terminated_length": 178.734375, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.345700740814209, + "epoch": 2.6458333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8896953174773692, + "kl": 0.034189336001873016, + "learning_rate": 4.2937984126980686e-08, + "loss": 0.0192, + "num_tokens": 68113025.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6207730770111084, + "sampling/importance_sampling_ratio/mean": 1.0003600120544434, + "sampling/importance_sampling_ratio/min": 0.6178777813911438, + "sampling/sampling_logp_difference/max": 0.48290324211120605, + "sampling/sampling_logp_difference/mean": 0.014010941609740257, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 239.34375, + "completions/mean_terminated_length": 239.34375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.42113956809043884, + "epoch": 2.6470588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7860919483765861, + "kl": 0.041868604719638824, + "learning_rate": 4.2649624882198196e-08, + "loss": 0.0206, + "num_tokens": 68148631.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.6585595607757568, + "sampling/importance_sampling_ratio/mean": 0.9998828172683716, + "sampling/importance_sampling_ratio/min": 0.620280385017395, + "sampling/sampling_logp_difference/max": 0.5059494972229004, + "sampling/sampling_logp_difference/mean": 0.015877092257142067, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 167.9375, + "completions/mean_terminated_length": 167.9375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.37317323684692383, + "epoch": 2.6482843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02156110873502598, + "kl": 0.032606083899736404, + "learning_rate": 4.2362194029703256e-08, + "loss": 0.0003, + "num_tokens": 68174243.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7885191440582275, + "sampling/importance_sampling_ratio/mean": 1.0001945495605469, + "sampling/importance_sampling_ratio/min": 0.6998572945594788, + "sampling/sampling_logp_difference/max": 0.581387996673584, + "sampling/sampling_logp_difference/mean": 0.014830888248980045, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 241.71875, + "completions/mean_terminated_length": 241.71875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.4071003794670105, + "epoch": 2.6495098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7269248362049141, + "kl": 0.03592121601104736, + "learning_rate": 4.207569215296214e-08, + "loss": 0.0128, + "num_tokens": 68207489.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4574626684188843, + "sampling/importance_sampling_ratio/mean": 0.9998969435691833, + "sampling/importance_sampling_ratio/min": 0.6248587369918823, + "sampling/sampling_logp_difference/max": 0.4702296257019043, + "sampling/sampling_logp_difference/mean": 0.01433138269931078, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 211.46875, + "completions/mean_terminated_length": 211.46875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3052547574043274, + "epoch": 2.650735294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012652424112284191, + "kl": 0.01907511055469513, + "learning_rate": 4.179011983355568e-08, + "loss": 0.0002, + "num_tokens": 68245839.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4420716762542725, + "sampling/importance_sampling_ratio/mean": 1.0002076625823975, + "sampling/importance_sampling_ratio/min": 0.7396199703216553, + "sampling/sampling_logp_difference/max": 0.36608076095581055, + "sampling/sampling_logp_difference/mean": 0.012432013638317585, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 188.765625, + "completions/mean_terminated_length": 188.765625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.3332515358924866, + "epoch": 2.6519607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0158117777399342, + "kl": 0.02347681298851967, + "learning_rate": 4.150547765117746e-08, + "loss": 0.0002, + "num_tokens": 68273536.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4440466165542603, + "sampling/importance_sampling_ratio/mean": 0.9997535943984985, + "sampling/importance_sampling_ratio/min": 0.44411587715148926, + "sampling/sampling_logp_difference/max": 0.8116698265075684, + "sampling/sampling_logp_difference/mean": 0.013610436581075191, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 202.796875, + "completions/mean_terminated_length": 202.796875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3937033414840698, + "epoch": 2.653186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0443737361684446, + "kl": 0.04720392823219299, + "learning_rate": 4.1221766183633045e-08, + "loss": -0.024, + "num_tokens": 68310051.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6483078002929688, + "sampling/importance_sampling_ratio/mean": 0.9999201893806458, + "sampling/importance_sampling_ratio/min": 0.614530086517334, + "sampling/sampling_logp_difference/max": 0.49974918365478516, + "sampling/sampling_logp_difference/mean": 0.015470354817807674, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 135.96875, + "completions/mean_terminated_length": 135.96875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.23793260753154755, + "epoch": 2.6544117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014384866276741116, + "kl": 0.017748642712831497, + "learning_rate": 4.0938986006838926e-08, + "loss": 0.0002, + "num_tokens": 68334193.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007353067398071, + "sampling/importance_sampling_ratio/mean": 0.9994776844978333, + "sampling/importance_sampling_ratio/min": 0.6216983199119568, + "sampling/sampling_logp_difference/max": 0.47530031204223633, + "sampling/sampling_logp_difference/mean": 0.011151906102895737, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 206.046875, + "completions/mean_terminated_length": 206.046875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.456847608089447, + "epoch": 2.655637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03623937633238227, + "kl": 0.05674976482987404, + "learning_rate": 4.065713769482082e-08, + "loss": 0.0006, + "num_tokens": 68366068.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5402013063430786, + "sampling/importance_sampling_ratio/mean": 1.0000441074371338, + "sampling/importance_sampling_ratio/min": 0.4165976345539093, + "sampling/sampling_logp_difference/max": 0.8756344318389893, + "sampling/sampling_logp_difference/mean": 0.017204947769641876, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 164.03125, + "completions/mean_terminated_length": 164.03125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3916319012641907, + "epoch": 2.656862745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9749861504971195, + "kl": 0.03285214304924011, + "learning_rate": 4.037622181971295e-08, + "loss": 0.0002, + "num_tokens": 68393654.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6085870265960693, + "sampling/importance_sampling_ratio/mean": 0.9998897314071655, + "sampling/importance_sampling_ratio/min": 0.7179526686668396, + "sampling/sampling_logp_difference/max": 0.4753561019897461, + "sampling/sampling_logp_difference/mean": 0.01594218611717224, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 187.296875, + "completions/mean_terminated_length": 187.296875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.4151933789253235, + "epoch": 2.6580882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0280970633762743, + "kl": 0.03589069843292236, + "learning_rate": 4.009623895175662e-08, + "loss": 0.0279, + "num_tokens": 68422777.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6305079460144043, + "sampling/importance_sampling_ratio/mean": 0.9992147088050842, + "sampling/importance_sampling_ratio/min": 0.4902857542037964, + "sampling/sampling_logp_difference/max": 0.7127668857574463, + "sampling/sampling_logp_difference/mean": 0.016101282089948654, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 196.53125, + "completions/mean_terminated_length": 196.53125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.3541117310523987, + "epoch": 2.659313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01327160963277398, + "kl": 0.02483142912387848, + "learning_rate": 3.981718965929959e-08, + "loss": 0.0002, + "num_tokens": 68457851.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.469534158706665, + "sampling/importance_sampling_ratio/mean": 1.0002583265304565, + "sampling/importance_sampling_ratio/min": 0.6208575963973999, + "sampling/sampling_logp_difference/max": 0.4766535758972168, + "sampling/sampling_logp_difference/mean": 0.013692292384803295, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 178.90625, + "completions/mean_terminated_length": 178.90625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.33619722723960876, + "epoch": 2.6605392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019335190864553025, + "kl": 0.026515118777751923, + "learning_rate": 3.953907450879407e-08, + "loss": 0.0003, + "num_tokens": 68484277.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3703358173370361, + "sampling/importance_sampling_ratio/mean": 1.000185489654541, + "sampling/importance_sampling_ratio/min": 0.6254917979240417, + "sampling/sampling_logp_difference/max": 0.46921706199645996, + "sampling/sampling_logp_difference/mean": 0.014374321326613426, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 217.03125, + "completions/mean_terminated_length": 217.03125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.4130726456642151, + "epoch": 2.661764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015376443809146078, + "kl": 0.031047407537698746, + "learning_rate": 3.926189406479613e-08, + "loss": 0.0003, + "num_tokens": 68521319.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5721830129623413, + "sampling/importance_sampling_ratio/mean": 1.000622272491455, + "sampling/importance_sampling_ratio/min": 0.6926271915435791, + "sampling/sampling_logp_difference/max": 0.4524650573730469, + "sampling/sampling_logp_difference/mean": 0.014572503045201302, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 186.9375, + "completions/mean_terminated_length": 186.9375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.43158823251724243, + "epoch": 2.6629901960784315, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2637325352312798, + "kl": 0.06611524522304535, + "learning_rate": 3.898564888996475e-08, + "loss": 0.0187, + "num_tokens": 68550451.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.8852076530456543, + "sampling/importance_sampling_ratio/mean": 1.0006320476531982, + "sampling/importance_sampling_ratio/min": 0.6424117088317871, + "sampling/sampling_logp_difference/max": 0.634037971496582, + "sampling/sampling_logp_difference/mean": 0.01655680499970913, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 248.96875, + "completions/mean_terminated_length": 248.96875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.34359270334243774, + "epoch": 2.6642156862745097, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8861158320358442, + "kl": 0.06602966040372849, + "learning_rate": 3.871033954505998e-08, + "loss": 0.0358, + "num_tokens": 68580289.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.354966402053833, + "sampling/importance_sampling_ratio/mean": 0.9997091889381409, + "sampling/importance_sampling_ratio/min": 0.2973400950431824, + "sampling/sampling_logp_difference/max": 1.212878704071045, + "sampling/sampling_logp_difference/mean": 0.013687359169125557, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 174.78125, + "completions/mean_terminated_length": 174.78125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3491934835910797, + "epoch": 2.6654411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01935570487676279, + "kl": 0.02674412727355957, + "learning_rate": 3.843596658894232e-08, + "loss": 0.0003, + "num_tokens": 68609683.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4011064767837524, + "sampling/importance_sampling_ratio/mean": 0.9998527765274048, + "sampling/importance_sampling_ratio/min": 0.7132253646850586, + "sampling/sampling_logp_difference/max": 0.33795785903930664, + "sampling/sampling_logp_difference/mean": 0.013049756176769733, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 165.3125, + "completions/mean_terminated_length": 165.3125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3384281396865845, + "epoch": 2.6666666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8721263736102155, + "kl": 0.058564648032188416, + "learning_rate": 3.816253057857144e-08, + "loss": 0.0009, + "num_tokens": 68636439.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4753774404525757, + "sampling/importance_sampling_ratio/mean": 0.9996055364608765, + "sampling/importance_sampling_ratio/min": 0.6482266783714294, + "sampling/sampling_logp_difference/max": 0.4335148334503174, + "sampling/sampling_logp_difference/mean": 0.014768009074032307, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 174.359375, + "completions/mean_terminated_length": 174.359375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.38247376680374146, + "epoch": 2.667892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016492188096115544, + "kl": 0.023220643401145935, + "learning_rate": 3.789003206900537e-08, + "loss": 0.0002, + "num_tokens": 68669246.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.541258454322815, + "sampling/importance_sampling_ratio/mean": 0.9998940229415894, + "sampling/importance_sampling_ratio/min": 0.7146031856536865, + "sampling/sampling_logp_difference/max": 0.4325993061065674, + "sampling/sampling_logp_difference/mean": 0.015042722225189209, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 226.609375, + "completions/mean_terminated_length": 226.609375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.406279981136322, + "epoch": 2.6691176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8099497129356059, + "kl": 0.02597496472299099, + "learning_rate": 3.7618471613398597e-08, + "loss": 0.0241, + "num_tokens": 68708469.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5932286977767944, + "sampling/importance_sampling_ratio/mean": 0.9998207092285156, + "sampling/importance_sampling_ratio/min": 0.3276415169239044, + "sampling/sampling_logp_difference/max": 1.115835189819336, + "sampling/sampling_logp_difference/mean": 0.015545186586678028, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 217.453125, + "completions/mean_terminated_length": 217.453125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4454168379306793, + "epoch": 2.670343137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7105919119139977, + "kl": 0.028678320348262787, + "learning_rate": 3.734784976300165e-08, + "loss": -0.0295, + "num_tokens": 68744546.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4453904628753662, + "sampling/importance_sampling_ratio/mean": 0.9995754361152649, + "sampling/importance_sampling_ratio/min": 0.6882218718528748, + "sampling/sampling_logp_difference/max": 0.37364399433135986, + "sampling/sampling_logp_difference/mean": 0.016125116497278214, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 198.125, + "completions/mean_terminated_length": 198.125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.4045882225036621, + "epoch": 2.6715686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01982988541081753, + "kl": 0.02784121036529541, + "learning_rate": 3.7078167067159826e-08, + "loss": 0.0003, + "num_tokens": 68772218.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4922415018081665, + "sampling/importance_sampling_ratio/mean": 1.000109314918518, + "sampling/importance_sampling_ratio/min": 0.6838484406471252, + "sampling/sampling_logp_difference/max": 0.4002794027328491, + "sampling/sampling_logp_difference/mean": 0.016974179074168205, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 198.46875, + "completions/mean_terminated_length": 198.46875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.4650859236717224, + "epoch": 2.672794117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01901885931020824, + "kl": 0.028293920680880547, + "learning_rate": 3.6809424073311944e-08, + "loss": 0.0003, + "num_tokens": 68804360.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999034404754639, + "sampling/importance_sampling_ratio/min": 0.6299536228179932, + "sampling/sampling_logp_difference/max": 0.7612266540527344, + "sampling/sampling_logp_difference/mean": 0.017532266676425934, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 195.3125, + "completions/mean_terminated_length": 195.3125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.32273679971694946, + "epoch": 2.674019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7491038958775039, + "kl": 0.03821868821978569, + "learning_rate": 3.654162132698918e-08, + "loss": 0.0003, + "num_tokens": 68832028.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.3335254192352295, + "sampling/importance_sampling_ratio/mean": 1.0005215406417847, + "sampling/importance_sampling_ratio/min": 0.6803061366081238, + "sampling/sampling_logp_difference/max": 0.38521242141723633, + "sampling/sampling_logp_difference/mean": 0.013480523601174355, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 198.4375, + "completions/mean_terminated_length": 198.4375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3952333331108093, + "epoch": 2.6752450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.838085929295133, + "kl": 0.05547407269477844, + "learning_rate": 3.627475937181407e-08, + "loss": 0.0126, + "num_tokens": 68866056.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.3003922700881958, + "sampling/importance_sampling_ratio/mean": 0.9999629855155945, + "sampling/importance_sampling_ratio/min": 0.47558560967445374, + "sampling/sampling_logp_difference/max": 0.7432084083557129, + "sampling/sampling_logp_difference/mean": 0.013009263202548027, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 186.984375, + "completions/mean_terminated_length": 186.984375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.39547306299209595, + "epoch": 2.6764705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01706766127270785, + "kl": 0.024384262040257454, + "learning_rate": 3.600883874949967e-08, + "loss": 0.0002, + "num_tokens": 68893815.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4879440069198608, + "sampling/importance_sampling_ratio/mean": 1.0000054836273193, + "sampling/importance_sampling_ratio/min": 0.6621916890144348, + "sampling/sampling_logp_difference/max": 0.4122002124786377, + "sampling/sampling_logp_difference/mean": 0.015582316555082798, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 222.609375, + "completions/mean_terminated_length": 222.609375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3833645284175873, + "epoch": 2.6776960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8444642619222315, + "kl": 0.023905735462903976, + "learning_rate": 3.574385999984786e-08, + "loss": -0.0109, + "num_tokens": 68926190.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5958937406539917, + "sampling/importance_sampling_ratio/mean": 0.9997842311859131, + "sampling/importance_sampling_ratio/min": 0.6492242813110352, + "sampling/sampling_logp_difference/max": 0.4674339294433594, + "sampling/sampling_logp_difference/mean": 0.015461385250091553, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 157.1875, + "completions/mean_terminated_length": 157.1875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3409844636917114, + "epoch": 2.678921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019021331515509558, + "kl": 0.027882926166057587, + "learning_rate": 3.54798236607487e-08, + "loss": 0.0003, + "num_tokens": 68950954.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3148980140686035, + "sampling/importance_sampling_ratio/mean": 0.999673068523407, + "sampling/importance_sampling_ratio/min": 0.6211196184158325, + "sampling/sampling_logp_difference/max": 0.47623157501220703, + "sampling/sampling_logp_difference/mean": 0.014010068029165268, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 158.390625, + "completions/mean_terminated_length": 158.390625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.4011605978012085, + "epoch": 2.6801470588235294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0054874252543198, + "kl": 0.0786287933588028, + "learning_rate": 3.5216730268179337e-08, + "loss": 0.0091, + "num_tokens": 68983491.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5853873491287231, + "sampling/importance_sampling_ratio/mean": 1.0001468658447266, + "sampling/importance_sampling_ratio/min": 0.6642369627952576, + "sampling/sampling_logp_difference/max": 0.4608287811279297, + "sampling/sampling_logp_difference/mean": 0.01586076244711876, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 184.90625, + "completions/mean_terminated_length": 184.90625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3708415627479553, + "epoch": 2.681372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020542560990797393, + "kl": 0.030698316171765327, + "learning_rate": 3.495458035620252e-08, + "loss": 0.0003, + "num_tokens": 69012285.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4295809268951416, + "sampling/importance_sampling_ratio/mean": 0.9994001388549805, + "sampling/importance_sampling_ratio/min": 0.6445699334144592, + "sampling/sampling_logp_difference/max": 0.43917202949523926, + "sampling/sampling_logp_difference/mean": 0.014541094191372395, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 169.0625, + "completions/mean_terminated_length": 169.0625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.41730359196662903, + "epoch": 2.6825980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020414554045821067, + "kl": 0.027429096400737762, + "learning_rate": 3.469337445696629e-08, + "loss": 0.0003, + "num_tokens": 69040561.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4753774404525757, + "sampling/importance_sampling_ratio/mean": 1.0000537633895874, + "sampling/importance_sampling_ratio/min": 0.6311920881271362, + "sampling/sampling_logp_difference/max": 0.4601449966430664, + "sampling/sampling_logp_difference/mean": 0.016278889030218124, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 189.015625, + "completions/mean_terminated_length": 189.015625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4509800970554352, + "epoch": 2.6838235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01918904970116333, + "kl": 0.03321604058146477, + "learning_rate": 3.4433113100701683e-08, + "loss": 0.0003, + "num_tokens": 69069698.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4652029275894165, + "sampling/importance_sampling_ratio/mean": 0.9996045827865601, + "sampling/importance_sampling_ratio/min": 0.679466187953949, + "sampling/sampling_logp_difference/max": 0.3864477872848511, + "sampling/sampling_logp_difference/mean": 0.017754685133695602, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 215.609375, + "completions/mean_terminated_length": 215.609375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3309880197048187, + "epoch": 2.685049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01778985067857809, + "kl": 0.026937957853078842, + "learning_rate": 3.417379681572296e-08, + "loss": 0.0003, + "num_tokens": 69101145.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5628186464309692, + "sampling/importance_sampling_ratio/mean": 1.000060796737671, + "sampling/importance_sampling_ratio/min": 0.5163277387619019, + "sampling/sampling_logp_difference/max": 0.6610136032104492, + "sampling/sampling_logp_difference/mean": 0.013586295768618584, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 195.4375, + "completions/mean_terminated_length": 195.4375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.429769366979599, + "epoch": 2.686274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015703317269527433, + "kl": 0.026214124634861946, + "learning_rate": 3.391542612842574e-08, + "loss": 0.0003, + "num_tokens": 69133029.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4691550731658936, + "sampling/importance_sampling_ratio/mean": 0.9995607137680054, + "sampling/importance_sampling_ratio/min": 0.6254510879516602, + "sampling/sampling_logp_difference/max": 0.4692821502685547, + "sampling/sampling_logp_difference/mean": 0.016176287084817886, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 187.671875, + "completions/mean_terminated_length": 187.671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.32622671127319336, + "epoch": 2.6875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9161237725815474, + "kl": 0.033167362213134766, + "learning_rate": 3.365800156328619e-08, + "loss": 0.0177, + "num_tokens": 69164480.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5998271703720093, + "sampling/importance_sampling_ratio/mean": 0.9996638298034668, + "sampling/importance_sampling_ratio/min": 0.5369217395782471, + "sampling/sampling_logp_difference/max": 0.6219029426574707, + "sampling/sampling_logp_difference/mean": 0.013408014550805092, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 200.609375, + "completions/mean_terminated_length": 200.609375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4098494052886963, + "epoch": 2.688725490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03817098702871339, + "kl": 0.025800131261348724, + "learning_rate": 3.3401523642859805e-08, + "loss": 0.0002, + "num_tokens": 69198823.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3600707054138184, + "sampling/importance_sampling_ratio/mean": 0.9994595050811768, + "sampling/importance_sampling_ratio/min": 0.6952628493309021, + "sampling/sampling_logp_difference/max": 0.3634653091430664, + "sampling/sampling_logp_difference/mean": 0.01569507271051407, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 183.734375, + "completions/mean_terminated_length": 183.734375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3755292296409607, + "epoch": 2.689950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9980885619171185, + "kl": 0.026967518031597137, + "learning_rate": 3.3145992887780475e-08, + "loss": -0.0563, + "num_tokens": 69227302.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4975247383117676, + "sampling/importance_sampling_ratio/mean": 0.998956024646759, + "sampling/importance_sampling_ratio/min": 0.5228726863861084, + "sampling/sampling_logp_difference/max": 0.6484172344207764, + "sampling/sampling_logp_difference/mean": 0.0146570960059762, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 240.875, + "completions/mean_terminated_length": 240.875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.35742872953414917, + "epoch": 2.6911764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010959171189284437, + "kl": 0.01894654706120491, + "learning_rate": 3.289140981675964e-08, + "loss": 0.0002, + "num_tokens": 69260462.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6141589879989624, + "sampling/importance_sampling_ratio/mean": 1.0003001689910889, + "sampling/importance_sampling_ratio/min": 0.6132730841636658, + "sampling/sampling_logp_difference/max": 0.48894500732421875, + "sampling/sampling_logp_difference/mean": 0.01310389768332243, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 245.640625, + "completions/mean_terminated_length": 245.640625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.4439132809638977, + "epoch": 2.6924019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0722411814635506, + "kl": 0.0608699694275856, + "learning_rate": 3.263777494658448e-08, + "loss": 0.0081, + "num_tokens": 69297735.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5792343616485596, + "sampling/importance_sampling_ratio/mean": 1.0005708932876587, + "sampling/importance_sampling_ratio/min": 0.6708058714866638, + "sampling/sampling_logp_difference/max": 0.4569401741027832, + "sampling/sampling_logp_difference/mean": 0.014909801073372364, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 212.875, + "completions/mean_terminated_length": 212.875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.33329761028289795, + "epoch": 2.693627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014752819647576854, + "kl": 0.02324068918824196, + "learning_rate": 3.2385088792118044e-08, + "loss": 0.0002, + "num_tokens": 69327407.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6179770231246948, + "sampling/importance_sampling_ratio/mean": 1.0001076459884644, + "sampling/importance_sampling_ratio/min": 0.6605718731880188, + "sampling/sampling_logp_difference/max": 0.48117661476135254, + "sampling/sampling_logp_difference/mean": 0.014407221227884293, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 235.28125, + "completions/mean_terminated_length": 235.28125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.41965699195861816, + "epoch": 2.6948529411764706, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0650251009076923, + "kl": 0.04170294106006622, + "learning_rate": 3.2133351866296955e-08, + "loss": -0.0279, + "num_tokens": 69362321.0, + "reward": 0.25, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5023787021636963, + "sampling/importance_sampling_ratio/mean": 1.0000550746917725, + "sampling/importance_sampling_ratio/min": 0.6670454740524292, + "sampling/sampling_logp_difference/max": 0.40704965591430664, + "sampling/sampling_logp_difference/mean": 0.014436185359954834, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 221.9375, + "completions/mean_terminated_length": 221.9375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.37368834018707275, + "epoch": 2.696078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012657824622548918, + "kl": 0.018766485154628754, + "learning_rate": 3.188256468013139e-08, + "loss": 0.0002, + "num_tokens": 69394909.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4549809694290161, + "sampling/importance_sampling_ratio/mean": 1.0002429485321045, + "sampling/importance_sampling_ratio/min": 0.6784660816192627, + "sampling/sampling_logp_difference/max": 0.3879207372665405, + "sampling/sampling_logp_difference/mean": 0.013512177392840385, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 181.984375, + "completions/mean_terminated_length": 181.984375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.44905218482017517, + "epoch": 2.6973039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.373832166490122, + "kl": 0.04382198676466942, + "learning_rate": 3.163272774270348e-08, + "loss": -0.0308, + "num_tokens": 69420956.0, + "reward": 0.15625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5140695571899414, + "sampling/importance_sampling_ratio/mean": 1.000220775604248, + "sampling/importance_sampling_ratio/min": 0.6410974264144897, + "sampling/sampling_logp_difference/max": 0.44457387924194336, + "sampling/sampling_logp_difference/mean": 0.01640354096889496, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 145.078125, + "completions/mean_terminated_length": 145.078125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.35924413800239563, + "epoch": 2.6985294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2076341985162982, + "kl": 0.07685059309005737, + "learning_rate": 3.1383841561166134e-08, + "loss": -0.0304, + "num_tokens": 69442785.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6274707317352295, + "sampling/importance_sampling_ratio/mean": 1.0003020763397217, + "sampling/importance_sampling_ratio/min": 0.6347670555114746, + "sampling/sampling_logp_difference/max": 0.4870271682739258, + "sampling/sampling_logp_difference/mean": 0.015782007947564125, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 197.46875, + "completions/mean_terminated_length": 197.46875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4621891975402832, + "epoch": 2.6997549019607843, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8831331827097225, + "kl": 0.053350359201431274, + "learning_rate": 3.1135906640742836e-08, + "loss": 0.0105, + "num_tokens": 69473791.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5071722269058228, + "sampling/importance_sampling_ratio/mean": 0.9998569488525391, + "sampling/importance_sampling_ratio/min": 0.6598401665687561, + "sampling/sampling_logp_difference/max": 0.4157576560974121, + "sampling/sampling_logp_difference/mean": 0.016618210822343826, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 205.828125, + "completions/mean_terminated_length": 205.828125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.3371545076370239, + "epoch": 2.700980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013762829006671326, + "kl": 0.020944753661751747, + "learning_rate": 3.088892348472561e-08, + "loss": 0.0002, + "num_tokens": 69505444.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6021487712860107, + "sampling/importance_sampling_ratio/mean": 0.9997724294662476, + "sampling/importance_sampling_ratio/min": 0.6512129306793213, + "sampling/sampling_logp_difference/max": 0.4713456630706787, + "sampling/sampling_logp_difference/mean": 0.013815833255648613, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 205.234375, + "completions/mean_terminated_length": 205.234375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4217361509799957, + "epoch": 2.702205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8403160454799576, + "kl": 0.03663209453225136, + "learning_rate": 3.064289259447455e-08, + "loss": -0.0207, + "num_tokens": 69532451.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4563947916030884, + "sampling/importance_sampling_ratio/mean": 1.000288486480713, + "sampling/importance_sampling_ratio/min": 0.6954967975616455, + "sampling/sampling_logp_difference/max": 0.37596404552459717, + "sampling/sampling_logp_difference/mean": 0.0165574848651886, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 227.109375, + "completions/mean_terminated_length": 227.109375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.5034979581832886, + "epoch": 2.7034313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8202365148511331, + "kl": 0.04638688638806343, + "learning_rate": 3.039781446941697e-08, + "loss": 0.0061, + "num_tokens": 69566266.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4162613153457642, + "sampling/importance_sampling_ratio/mean": 1.0002670288085938, + "sampling/importance_sampling_ratio/min": 0.7326375842094421, + "sampling/sampling_logp_difference/max": 0.3480205535888672, + "sampling/sampling_logp_difference/mean": 0.016674406826496124, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 178.96875, + "completions/mean_terminated_length": 178.96875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.4002281427383423, + "epoch": 2.704656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021476810741575578, + "kl": 0.02550198882818222, + "learning_rate": 3.015368960704584e-08, + "loss": 0.0003, + "num_tokens": 69595512.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4954711198806763, + "sampling/importance_sampling_ratio/mean": 0.9999463558197021, + "sampling/importance_sampling_ratio/min": 0.602114737033844, + "sampling/sampling_logp_difference/max": 0.5073072910308838, + "sampling/sampling_logp_difference/mean": 0.0180397629737854, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 227.09375, + "completions/mean_terminated_length": 227.09375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.36214518547058105, + "epoch": 2.7058823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5992061058659258, + "kl": 0.03874623402953148, + "learning_rate": 2.991051850291915e-08, + "loss": 0.0026, + "num_tokens": 69624574.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5040321350097656, + "sampling/importance_sampling_ratio/mean": 0.9997340440750122, + "sampling/importance_sampling_ratio/min": 0.6141670942306519, + "sampling/sampling_logp_difference/max": 0.4874882698059082, + "sampling/sampling_logp_difference/mean": 0.014499923214316368, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 228.375, + "completions/mean_terminated_length": 228.375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.3309875726699829, + "epoch": 2.707107843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0591527126619875, + "kl": 0.027056990191340446, + "learning_rate": 2.9668301650658756e-08, + "loss": -0.019, + "num_tokens": 69659990.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3536651134490967, + "sampling/importance_sampling_ratio/mean": 1.0002684593200684, + "sampling/importance_sampling_ratio/min": 0.5260624885559082, + "sampling/sampling_logp_difference/max": 0.6423352956771851, + "sampling/sampling_logp_difference/mean": 0.01373043842613697, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 188.71875, + "completions/mean_terminated_length": 188.71875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3033469319343567, + "epoch": 2.7083333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011828145824959092, + "kl": 0.019223835319280624, + "learning_rate": 2.9427039541949638e-08, + "loss": 0.0002, + "num_tokens": 69688420.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4354578256607056, + "sampling/importance_sampling_ratio/mean": 0.9997765421867371, + "sampling/importance_sampling_ratio/min": 0.6278591156005859, + "sampling/sampling_logp_difference/max": 0.4654395580291748, + "sampling/sampling_logp_difference/mean": 0.013137388974428177, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 185.4375, + "completions/mean_terminated_length": 185.4375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3659963607788086, + "epoch": 2.7095588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01739920974455112, + "kl": 0.028903864324092865, + "learning_rate": 2.918673266653865e-08, + "loss": 0.0003, + "num_tokens": 69718048.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6234164237976074, + "sampling/importance_sampling_ratio/mean": 0.9997466802597046, + "sampling/importance_sampling_ratio/min": 0.5808159708976746, + "sampling/sampling_logp_difference/max": 0.5433213710784912, + "sampling/sampling_logp_difference/mean": 0.014746871776878834, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 187.859375, + "completions/mean_terminated_length": 187.859375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.39722689986228943, + "epoch": 2.7107843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8294509745509189, + "kl": 0.05414985120296478, + "learning_rate": 2.8947381512233305e-08, + "loss": -0.0086, + "num_tokens": 69747591.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5189282894134521, + "sampling/importance_sampling_ratio/mean": 0.9999840259552002, + "sampling/importance_sampling_ratio/min": 0.577173113822937, + "sampling/sampling_logp_difference/max": 0.5496129989624023, + "sampling/sampling_logp_difference/mean": 0.016706984490156174, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 197.46875, + "completions/mean_terminated_length": 197.46875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4055330753326416, + "epoch": 2.7120098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0365294035465076, + "kl": 0.04686344042420387, + "learning_rate": 2.8708986564901504e-08, + "loss": 0.016, + "num_tokens": 69778485.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999788761138916, + "sampling/importance_sampling_ratio/min": 0.680418848991394, + "sampling/sampling_logp_difference/max": 0.709975004196167, + "sampling/sampling_logp_difference/mean": 0.015084546059370041, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 187.78125, + "completions/mean_terminated_length": 187.78125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.2596806287765503, + "epoch": 2.713235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016154535797120406, + "kl": 0.019631171599030495, + "learning_rate": 2.8471548308469706e-08, + "loss": 0.0002, + "num_tokens": 69802615.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6073639392852783, + "sampling/importance_sampling_ratio/mean": 0.9996476173400879, + "sampling/importance_sampling_ratio/min": 0.6554355621337891, + "sampling/sampling_logp_difference/max": 0.4745955467224121, + "sampling/sampling_logp_difference/mean": 0.013249853625893593, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 183.921875, + "completions/mean_terminated_length": 183.921875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3703579902648926, + "epoch": 2.7144607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.930933398269968, + "kl": 0.04081626236438751, + "learning_rate": 2.8235067224922802e-08, + "loss": 0.0157, + "num_tokens": 69829122.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4594871997833252, + "sampling/importance_sampling_ratio/mean": 0.9999802708625793, + "sampling/importance_sampling_ratio/min": 0.6332386136054993, + "sampling/sampling_logp_difference/max": 0.4569079875946045, + "sampling/sampling_logp_difference/mean": 0.015322159975767136, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 233.03125, + "completions/mean_terminated_length": 233.03125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.36985495686531067, + "epoch": 2.715686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018448594596314177, + "kl": 0.022692425176501274, + "learning_rate": 2.799954379430208e-08, + "loss": 0.0002, + "num_tokens": 69865732.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5356004238128662, + "sampling/importance_sampling_ratio/mean": 1.0003145933151245, + "sampling/importance_sampling_ratio/min": 0.6561621427536011, + "sampling/sampling_logp_difference/max": 0.4289214611053467, + "sampling/sampling_logp_difference/mean": 0.013818234205245972, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 172.84375, + "completions/mean_terminated_length": 172.84375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.45772528648376465, + "epoch": 2.7169117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021401507381491623, + "kl": 0.030462948605418205, + "learning_rate": 2.7764978494705437e-08, + "loss": 0.0003, + "num_tokens": 69897178.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001213550567627, + "sampling/importance_sampling_ratio/min": 0.6583269238471985, + "sampling/sampling_logp_difference/max": 0.9321606159210205, + "sampling/sampling_logp_difference/mean": 0.017726056277751923, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 167.3125, + "completions/mean_terminated_length": 167.3125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.27990877628326416, + "epoch": 2.718137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014808584355121727, + "kl": 0.024075474590063095, + "learning_rate": 2.753137180228543e-08, + "loss": 0.0002, + "num_tokens": 69920606.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6250072717666626, + "sampling/importance_sampling_ratio/mean": 1.0006778240203857, + "sampling/importance_sampling_ratio/min": 0.6498124599456787, + "sampling/sampling_logp_difference/max": 0.48551225662231445, + "sampling/sampling_logp_difference/mean": 0.012928958982229233, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 246.40625, + "completions/mean_terminated_length": 246.40625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.43565645813941956, + "epoch": 2.719362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024239787636330484, + "kl": 0.04958684742450714, + "learning_rate": 2.729872419124879e-08, + "loss": 0.0005, + "num_tokens": 69953832.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3965340852737427, + "sampling/importance_sampling_ratio/mean": 0.9995464086532593, + "sampling/importance_sampling_ratio/min": 0.6970493793487549, + "sampling/sampling_logp_difference/max": 0.3608989715576172, + "sampling/sampling_logp_difference/mean": 0.014671550132334232, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 169.765625, + "completions/mean_terminated_length": 169.765625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.370225727558136, + "epoch": 2.7205882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025148167441551832, + "kl": 0.029639123007655144, + "learning_rate": 2.7067036133855636e-08, + "loss": 0.0003, + "num_tokens": 69983833.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3967820405960083, + "sampling/importance_sampling_ratio/mean": 1.0000736713409424, + "sampling/importance_sampling_ratio/min": 0.6781371235847473, + "sampling/sampling_logp_difference/max": 0.38840579986572266, + "sampling/sampling_logp_difference/mean": 0.015588011592626572, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 165.875, + "completions/mean_terminated_length": 165.875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.36063680052757263, + "epoch": 2.721813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02809174824148526, + "kl": 0.04175316542387009, + "learning_rate": 2.6836308100417872e-08, + "loss": 0.0005, + "num_tokens": 70012609.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7515075206756592, + "sampling/importance_sampling_ratio/mean": 1.0000413656234741, + "sampling/importance_sampling_ratio/min": 0.6815227270126343, + "sampling/sampling_logp_difference/max": 0.5604767799377441, + "sampling/sampling_logp_difference/mean": 0.014000261202454567, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 173.265625, + "completions/mean_terminated_length": 173.265625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3790750205516815, + "epoch": 2.7230392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017831589623363682, + "kl": 0.039492081850767136, + "learning_rate": 2.6606540559298952e-08, + "loss": 0.0004, + "num_tokens": 70042258.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.531073808670044, + "sampling/importance_sampling_ratio/mean": 1.0003293752670288, + "sampling/importance_sampling_ratio/min": 0.6711210012435913, + "sampling/sampling_logp_difference/max": 0.42596936225891113, + "sampling/sampling_logp_difference/mean": 0.013877512887120247, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 189.171875, + "completions/mean_terminated_length": 189.171875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3294864892959595, + "epoch": 2.724264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016183732643314658, + "kl": 0.022651515901088715, + "learning_rate": 2.6377733976912232e-08, + "loss": 0.0002, + "num_tokens": 70068909.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3656457662582397, + "sampling/importance_sampling_ratio/mean": 1.000307321548462, + "sampling/importance_sampling_ratio/min": 0.6056224703788757, + "sampling/sampling_logp_difference/max": 0.5014984607696533, + "sampling/sampling_logp_difference/mean": 0.01442730613052845, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 204.765625, + "completions/mean_terminated_length": 204.765625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.35736751556396484, + "epoch": 2.7254901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019247193422217267, + "kl": 0.027182430028915405, + "learning_rate": 2.6149888817720733e-08, + "loss": 0.0003, + "num_tokens": 70101326.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6362637281417847, + "sampling/importance_sampling_ratio/mean": 0.9998779892921448, + "sampling/importance_sampling_ratio/min": 0.6739427447319031, + "sampling/sampling_logp_difference/max": 0.4924154281616211, + "sampling/sampling_logp_difference/mean": 0.014462153427302837, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 212.390625, + "completions/mean_terminated_length": 212.390625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.38081106543540955, + "epoch": 2.7267156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013739196201482876, + "kl": 0.021417230367660522, + "learning_rate": 2.5923005544235545e-08, + "loss": 0.0002, + "num_tokens": 70138359.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.567293643951416, + "sampling/importance_sampling_ratio/mean": 1.0004489421844482, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4679851531982422, + "sampling/sampling_logp_difference/mean": 0.014387840405106544, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 326.515625, + "completions/mean_terminated_length": 326.515625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.4487899839878082, + "epoch": 2.7279411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5790579301037666, + "kl": 0.03546018898487091, + "learning_rate": 2.5697084617015475e-08, + "loss": -0.006, + "num_tokens": 70182664.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.5986748933792114, + "sampling/importance_sampling_ratio/mean": 0.9997212290763855, + "sampling/importance_sampling_ratio/min": 0.7025700807571411, + "sampling/sampling_logp_difference/max": 0.4691751003265381, + "sampling/sampling_logp_difference/mean": 0.0149573078379035, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 205.171875, + "completions/mean_terminated_length": 205.171875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.39314544200897217, + "epoch": 2.7291666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044041662114271914, + "kl": 0.05147264525294304, + "learning_rate": 2.547212649466568e-08, + "loss": 0.0006, + "num_tokens": 70217875.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6088998317718506, + "sampling/importance_sampling_ratio/mean": 0.999459445476532, + "sampling/importance_sampling_ratio/min": 0.42561814188957214, + "sampling/sampling_logp_difference/max": 0.854212760925293, + "sampling/sampling_logp_difference/mean": 0.015132260508835316, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 144.3125, + "completions/mean_terminated_length": 144.3125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.38113486766815186, + "epoch": 2.730392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0513687121081103, + "kl": 0.035049207508563995, + "learning_rate": 2.5248131633836823e-08, + "loss": 0.0093, + "num_tokens": 70250679.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5746089220046997, + "sampling/importance_sampling_ratio/mean": 0.9993321299552917, + "sampling/importance_sampling_ratio/min": 0.5483723282814026, + "sampling/sampling_logp_difference/max": 0.6008007526397705, + "sampling/sampling_logp_difference/mean": 0.01642269268631935, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 220.15625, + "completions/mean_terminated_length": 220.15625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.47911232709884644, + "epoch": 2.7316176470588234, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2832749805390427, + "kl": 0.04859183728694916, + "learning_rate": 2.5025100489224406e-08, + "loss": -0.0241, + "num_tokens": 70283617.0, + "reward": 0.21875, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5856502056121826, + "sampling/importance_sampling_ratio/mean": 0.9997179508209229, + "sampling/importance_sampling_ratio/min": 0.6418494582176208, + "sampling/sampling_logp_difference/max": 0.4609944820404053, + "sampling/sampling_logp_difference/mean": 0.017129074782133102, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 237.40625, + "completions/mean_terminated_length": 237.40625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4551759958267212, + "epoch": 2.732843137254902, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8730456578928805, + "kl": 0.07192547619342804, + "learning_rate": 2.480303351356733e-08, + "loss": 0.0153, + "num_tokens": 70320075.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4353076219558716, + "sampling/importance_sampling_ratio/mean": 0.9998223781585693, + "sampling/importance_sampling_ratio/min": 0.6555653810501099, + "sampling/sampling_logp_difference/max": 0.4222571849822998, + "sampling/sampling_logp_difference/mean": 0.015772780403494835, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 167.3125, + "completions/mean_terminated_length": 167.3125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3926674723625183, + "epoch": 2.7340686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018773129247530156, + "kl": 0.028011813759803772, + "learning_rate": 2.4581931157647674e-08, + "loss": 0.0003, + "num_tokens": 70348191.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6062246561050415, + "sampling/importance_sampling_ratio/mean": 1.0012869834899902, + "sampling/importance_sampling_ratio/min": 0.6412144899368286, + "sampling/sampling_logp_difference/max": 0.47388648986816406, + "sampling/sampling_logp_difference/mean": 0.015152723528444767, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 200.546875, + "completions/mean_terminated_length": 200.546875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3412466049194336, + "epoch": 2.735294117647059, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7558585756028967, + "kl": 0.023498691618442535, + "learning_rate": 2.4361793870289028e-08, + "loss": 0.0066, + "num_tokens": 70379170.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3610817193984985, + "sampling/importance_sampling_ratio/mean": 0.9996642470359802, + "sampling/importance_sampling_ratio/min": 0.5478037595748901, + "sampling/sampling_logp_difference/max": 0.6018381118774414, + "sampling/sampling_logp_difference/mean": 0.013857526704668999, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 212.625, + "completions/mean_terminated_length": 212.625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.35404205322265625, + "epoch": 2.736519607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01642056073563973, + "kl": 0.028945980593562126, + "learning_rate": 2.4142622098356326e-08, + "loss": 0.0003, + "num_tokens": 70411066.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6273410320281982, + "sampling/importance_sampling_ratio/mean": 1.0002670288085938, + "sampling/importance_sampling_ratio/min": 0.6700373888015747, + "sampling/sampling_logp_difference/max": 0.4869474172592163, + "sampling/sampling_logp_difference/mean": 0.014044288545846939, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 203.359375, + "completions/mean_terminated_length": 203.359375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3403061032295227, + "epoch": 2.7377450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016057319750129643, + "kl": 0.023418106138706207, + "learning_rate": 2.3924416286754345e-08, + "loss": 0.0002, + "num_tokens": 70439361.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 1.000166416168213, + "sampling/importance_sampling_ratio/min": 0.6251910924911499, + "sampling/sampling_logp_difference/max": 0.48689985275268555, + "sampling/sampling_logp_difference/mean": 0.014232734218239784, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 233.328125, + "completions/mean_terminated_length": 233.328125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.2991639971733093, + "epoch": 2.7389705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012331143078647135, + "kl": 0.01602299138903618, + "learning_rate": 2.3707176878426882e-08, + "loss": 0.0002, + "num_tokens": 70472294.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.436407446861267, + "sampling/importance_sampling_ratio/mean": 0.9996287822723389, + "sampling/importance_sampling_ratio/min": 0.3998073935508728, + "sampling/sampling_logp_difference/max": 0.9167723655700684, + "sampling/sampling_logp_difference/mean": 0.01243562251329422, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 170.71875, + "completions/mean_terminated_length": 170.71875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.34510964155197144, + "epoch": 2.7401960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015965299142909544, + "kl": 0.02148594707250595, + "learning_rate": 2.3490904314356407e-08, + "loss": 0.0002, + "num_tokens": 70500948.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4352436065673828, + "sampling/importance_sampling_ratio/mean": 0.9997973442077637, + "sampling/importance_sampling_ratio/min": 0.6628729701042175, + "sampling/sampling_logp_difference/max": 0.41117191314697266, + "sampling/sampling_logp_difference/mean": 0.01357237994670868, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 208.71875, + "completions/mean_terminated_length": 208.71875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3250342309474945, + "epoch": 2.741421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013885240972677616, + "kl": 0.01940862461924553, + "learning_rate": 2.327559903356241e-08, + "loss": 0.0002, + "num_tokens": 70539666.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4104986190795898, + "sampling/importance_sampling_ratio/mean": 1.0001335144042969, + "sampling/importance_sampling_ratio/min": 0.6416339874267578, + "sampling/sampling_logp_difference/max": 0.443737268447876, + "sampling/sampling_logp_difference/mean": 0.012945730239152908, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 198.640625, + "completions/mean_terminated_length": 198.640625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.33332884311676025, + "epoch": 2.7426470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012336247457790443, + "kl": 0.02224317193031311, + "learning_rate": 2.3061261473101002e-08, + "loss": 0.0002, + "num_tokens": 70574011.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.618847131729126, + "sampling/importance_sampling_ratio/mean": 0.9995735883712769, + "sampling/importance_sampling_ratio/min": 0.6375756859779358, + "sampling/sampling_logp_difference/max": 0.48171424865722656, + "sampling/sampling_logp_difference/mean": 0.01353538315743208, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 173.71875, + "completions/mean_terminated_length": 173.71875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4061875641345978, + "epoch": 2.743872549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7488505424053589, + "kl": 0.03188773989677429, + "learning_rate": 2.2847892068063755e-08, + "loss": 0.0138, + "num_tokens": 70606857.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6663976907730103, + "sampling/importance_sampling_ratio/mean": 1.0003501176834106, + "sampling/importance_sampling_ratio/min": 0.6072332859039307, + "sampling/sampling_logp_difference/max": 0.5106642246246338, + "sampling/sampling_logp_difference/mean": 0.017464924603700638, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 200.40625, + "completions/mean_terminated_length": 200.40625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4441275894641876, + "epoch": 2.7450980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0628466768420897, + "kl": 0.03030751273036003, + "learning_rate": 2.263549125157721e-08, + "loss": 0.0251, + "num_tokens": 70640419.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5233913660049438, + "sampling/importance_sampling_ratio/mean": 1.000167965888977, + "sampling/importance_sampling_ratio/min": 0.7082679271697998, + "sampling/sampling_logp_difference/max": 0.42093896865844727, + "sampling/sampling_logp_difference/mean": 0.015648623928427696, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 180.75, + "completions/mean_terminated_length": 180.75, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.386197566986084, + "epoch": 2.7463235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8935470667623276, + "kl": 0.026635531336069107, + "learning_rate": 2.242405945480147e-08, + "loss": 0.0007, + "num_tokens": 70668179.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3671354055404663, + "sampling/importance_sampling_ratio/mean": 1.0001649856567383, + "sampling/importance_sampling_ratio/min": 0.7245340347290039, + "sampling/sampling_logp_difference/max": 0.32222652435302734, + "sampling/sampling_logp_difference/mean": 0.015021894127130508, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 195.84375, + "completions/mean_terminated_length": 195.84375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3563383221626282, + "epoch": 2.747549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013005726344480849, + "kl": 0.019569067284464836, + "learning_rate": 2.2213597106929605e-08, + "loss": 0.0002, + "num_tokens": 70703049.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5812333822250366, + "sampling/importance_sampling_ratio/mean": 1.0001039505004883, + "sampling/importance_sampling_ratio/min": 0.6924002170562744, + "sampling/sampling_logp_difference/max": 0.4582052230834961, + "sampling/sampling_logp_difference/mean": 0.014485219493508339, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 185.890625, + "completions/mean_terminated_length": 185.890625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.35772231221199036, + "epoch": 2.748774509803922, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.287825080029086, + "kl": 0.03768147900700569, + "learning_rate": 2.200410463518704e-08, + "loss": 0.0004, + "num_tokens": 70730034.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3495383262634277, + "sampling/importance_sampling_ratio/mean": 0.9996664524078369, + "sampling/importance_sampling_ratio/min": 0.6629447340965271, + "sampling/sampling_logp_difference/max": 0.41106367111206055, + "sampling/sampling_logp_difference/mean": 0.014889972284436226, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 238.703125, + "completions/mean_terminated_length": 238.703125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.3390355706214905, + "epoch": 2.75, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7348305087159989, + "kl": 0.026839446276426315, + "learning_rate": 2.1795582464830153e-08, + "loss": -0.0189, + "num_tokens": 70760751.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.3670462369918823, + "sampling/importance_sampling_ratio/mean": 1.0002970695495605, + "sampling/importance_sampling_ratio/min": 0.6226705312728882, + "sampling/sampling_logp_difference/max": 0.4737377166748047, + "sampling/sampling_logp_difference/mean": 0.013434633612632751, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 158.125, + "completions/mean_terminated_length": 158.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.2933042645454407, + "epoch": 2.751225490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.023499611836145, + "kl": 0.05593154579401016, + "learning_rate": 2.1588031019145636e-08, + "loss": 0.0023, + "num_tokens": 70786807.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.374788522720337, + "sampling/importance_sampling_ratio/mean": 1.0001814365386963, + "sampling/importance_sampling_ratio/min": 0.6192825436592102, + "sampling/sampling_logp_difference/max": 0.47919368743896484, + "sampling/sampling_logp_difference/mean": 0.012308412231504917, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 214.828125, + "completions/mean_terminated_length": 214.828125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3517012596130371, + "epoch": 2.752450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012697852038452596, + "kl": 0.019416378811001778, + "learning_rate": 2.13814507194498e-08, + "loss": 0.0002, + "num_tokens": 70817516.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.44302237033844, + "sampling/importance_sampling_ratio/mean": 1.000140905380249, + "sampling/importance_sampling_ratio/min": 0.6577356457710266, + "sampling/sampling_logp_difference/max": 0.41895222663879395, + "sampling/sampling_logp_difference/mean": 0.01361516211181879, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 216.109375, + "completions/mean_terminated_length": 216.109375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.5082956552505493, + "epoch": 2.7536764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9445501706534202, + "kl": 0.055819157510995865, + "learning_rate": 2.1175841985087707e-08, + "loss": -0.0334, + "num_tokens": 70852131.0, + "reward": -0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.6112245321273804, + "sampling/importance_sampling_ratio/mean": 1.000665545463562, + "sampling/importance_sampling_ratio/min": 0.618553876876831, + "sampling/sampling_logp_difference/max": 0.48037099838256836, + "sampling/sampling_logp_difference/mean": 0.018055545166134834, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 214.78125, + "completions/mean_terminated_length": 214.78125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.37545245885849, + "epoch": 2.7549019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0201676838857112, + "kl": 0.024367734789848328, + "learning_rate": 2.097120523343199e-08, + "loss": 0.0087, + "num_tokens": 70884005.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5336076021194458, + "sampling/importance_sampling_ratio/mean": 0.9998330473899841, + "sampling/importance_sampling_ratio/min": 0.6409609913825989, + "sampling/sampling_logp_difference/max": 0.4447866678237915, + "sampling/sampling_logp_difference/mean": 0.014355825260281563, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 161.359375, + "completions/mean_terminated_length": 161.359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.42221778631210327, + "epoch": 2.756127450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9751308068647037, + "kl": 0.047976039350032806, + "learning_rate": 2.076754087988214e-08, + "loss": 0.0062, + "num_tokens": 70909356.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.541296362876892, + "sampling/importance_sampling_ratio/mean": 0.9989932775497437, + "sampling/importance_sampling_ratio/min": 0.6262674927711487, + "sampling/sampling_logp_difference/max": 0.46797776222229004, + "sampling/sampling_logp_difference/mean": 0.016862986609339714, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 216.328125, + "completions/mean_terminated_length": 216.328125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.28034213185310364, + "epoch": 2.7573529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0129235382913119, + "kl": 0.016742991283535957, + "learning_rate": 2.0564849337864122e-08, + "loss": 0.0002, + "num_tokens": 70940945.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5344040393829346, + "sampling/importance_sampling_ratio/mean": 0.9997872114181519, + "sampling/importance_sampling_ratio/min": 0.6181705594062805, + "sampling/sampling_logp_difference/max": 0.4809908866882324, + "sampling/sampling_logp_difference/mean": 0.011822382919490337, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 181.875, + "completions/mean_terminated_length": 181.875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3743704557418823, + "epoch": 2.758578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020817038768702053, + "kl": 0.026140248402953148, + "learning_rate": 2.036313101882875e-08, + "loss": 0.0003, + "num_tokens": 70977385.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.324415922164917, + "sampling/importance_sampling_ratio/mean": 1.0000091791152954, + "sampling/importance_sampling_ratio/min": 0.6054502129554749, + "sampling/sampling_logp_difference/max": 0.5017828941345215, + "sampling/sampling_logp_difference/mean": 0.01400675904005766, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 209.3125, + "completions/mean_terminated_length": 209.3125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.43410447239875793, + "epoch": 2.7598039215686274, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1545171007395325, + "kl": 0.044753510504961014, + "learning_rate": 2.0162386332251648e-08, + "loss": -0.0003, + "num_tokens": 71011165.0, + "reward": 0.125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.521191120147705, + "sampling/importance_sampling_ratio/mean": 0.9995642900466919, + "sampling/importance_sampling_ratio/min": 0.6394612789154053, + "sampling/sampling_logp_difference/max": 0.4471292495727539, + "sampling/sampling_logp_difference/mean": 0.01565314084291458, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 224.984375, + "completions/mean_terminated_length": 224.984375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4113616645336151, + "epoch": 2.7610294117647056, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1315484898630275, + "kl": 0.03205491602420807, + "learning_rate": 1.9962615685631568e-08, + "loss": 0.031, + "num_tokens": 71042924.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4372361898422241, + "sampling/importance_sampling_ratio/mean": 1.000169277191162, + "sampling/importance_sampling_ratio/min": 0.6896570324897766, + "sampling/sampling_logp_difference/max": 0.3715609312057495, + "sampling/sampling_logp_difference/mean": 0.014121998101472855, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 198.265625, + "completions/mean_terminated_length": 198.265625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3452353775501251, + "epoch": 2.7622549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037089866348583675, + "kl": 0.03471102565526962, + "learning_rate": 1.976381948449035e-08, + "loss": 0.0004, + "num_tokens": 71079693.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6159518957138062, + "sampling/importance_sampling_ratio/mean": 0.9987235069274902, + "sampling/importance_sampling_ratio/min": 0.6117444038391113, + "sampling/sampling_logp_difference/max": 0.4914407730102539, + "sampling/sampling_logp_difference/mean": 0.014984086155891418, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 191.609375, + "completions/mean_terminated_length": 191.609375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4518758952617645, + "epoch": 2.763480392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030282372411408555, + "kl": 0.06672383844852448, + "learning_rate": 1.9565998132371808e-08, + "loss": 0.0007, + "num_tokens": 71114324.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5612155199050903, + "sampling/importance_sampling_ratio/mean": 1.0006022453308105, + "sampling/importance_sampling_ratio/min": 0.6825100779533386, + "sampling/sampling_logp_difference/max": 0.44546473026275635, + "sampling/sampling_logp_difference/mean": 0.01747298426926136, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 242.953125, + "completions/mean_terminated_length": 242.953125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.3945736587047577, + "epoch": 2.764705882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.787144812149263, + "kl": 0.03934440016746521, + "learning_rate": 1.936915203084055e-08, + "loss": 0.0112, + "num_tokens": 71149137.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.582056999206543, + "sampling/importance_sampling_ratio/mean": 1.000464916229248, + "sampling/importance_sampling_ratio/min": 0.6629458665847778, + "sampling/sampling_logp_difference/max": 0.4587259292602539, + "sampling/sampling_logp_difference/mean": 0.015069615095853806, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 206.421875, + "completions/mean_terminated_length": 206.421875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.39079004526138306, + "epoch": 2.7659313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08122842759486407, + "kl": 0.04704172909259796, + "learning_rate": 1.9173281579481894e-08, + "loss": 0.0005, + "num_tokens": 71179676.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5323126316070557, + "sampling/importance_sampling_ratio/mean": 1.0000157356262207, + "sampling/importance_sampling_ratio/min": 0.6178263425827026, + "sampling/sampling_logp_difference/max": 0.48154783248901367, + "sampling/sampling_logp_difference/mean": 0.014137955382466316, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 189.390625, + "completions/mean_terminated_length": 189.390625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.4201244115829468, + "epoch": 2.767156862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9584494048322517, + "kl": 0.03773742541670799, + "learning_rate": 1.897838717590028e-08, + "loss": 0.0236, + "num_tokens": 71214613.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4423161745071411, + "sampling/importance_sampling_ratio/mean": 0.9996387958526611, + "sampling/importance_sampling_ratio/min": 0.5998695492744446, + "sampling/sampling_logp_difference/max": 0.5110430717468262, + "sampling/sampling_logp_difference/mean": 0.015818282961845398, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 189.671875, + "completions/mean_terminated_length": 189.671875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3822367191314697, + "epoch": 2.7683823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01981045146895738, + "kl": 0.02560245618224144, + "learning_rate": 1.8784469215719077e-08, + "loss": 0.0003, + "num_tokens": 71245856.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4124040603637695, + "sampling/importance_sampling_ratio/mean": 1.0001107454299927, + "sampling/importance_sampling_ratio/min": 0.7300852537155151, + "sampling/sampling_logp_difference/max": 0.3452932834625244, + "sampling/sampling_logp_difference/mean": 0.015097684226930141, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 222.78125, + "completions/mean_terminated_length": 222.78125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.38841933012008667, + "epoch": 2.769607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7455509339217324, + "kl": 0.036847107112407684, + "learning_rate": 1.8591528092579524e-08, + "loss": -0.0152, + "num_tokens": 71277522.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.47059965133667, + "sampling/importance_sampling_ratio/mean": 0.9993435144424438, + "sampling/importance_sampling_ratio/min": 0.6143800616264343, + "sampling/sampling_logp_difference/max": 0.48714160919189453, + "sampling/sampling_logp_difference/mean": 0.014544347301125526, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 214.375, + "completions/mean_terminated_length": 214.375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.43171802163124084, + "epoch": 2.7708333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018514479874583628, + "kl": 0.03312084823846817, + "learning_rate": 1.8399564198139707e-08, + "loss": 0.0003, + "num_tokens": 71313386.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6046829223632812, + "sampling/importance_sampling_ratio/mean": 1.0005278587341309, + "sampling/importance_sampling_ratio/min": 0.6150000691413879, + "sampling/sampling_logp_difference/max": 0.4861328601837158, + "sampling/sampling_logp_difference/mean": 0.015385524369776249, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 203.5625, + "completions/mean_terminated_length": 203.5625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.28259339928627014, + "epoch": 2.7720588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013630810725824267, + "kl": 0.02327803522348404, + "learning_rate": 1.8208577922074308e-08, + "loss": 0.0002, + "num_tokens": 71342894.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3212229013442993, + "sampling/importance_sampling_ratio/mean": 1.0000630617141724, + "sampling/importance_sampling_ratio/min": 0.6086598634719849, + "sampling/sampling_logp_difference/max": 0.49649572372436523, + "sampling/sampling_logp_difference/mean": 0.01166549976915121, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 198.34375, + "completions/mean_terminated_length": 198.34375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3560211956501007, + "epoch": 2.7732843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8075662995223074, + "kl": 0.024799000471830368, + "learning_rate": 1.8018569652073378e-08, + "loss": -0.034, + "num_tokens": 71377780.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5474638938903809, + "sampling/importance_sampling_ratio/mean": 0.9999863505363464, + "sampling/importance_sampling_ratio/min": 0.688747227191925, + "sampling/sampling_logp_difference/max": 0.436617374420166, + "sampling/sampling_logp_difference/mean": 0.014782153069972992, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 173.390625, + "completions/mean_terminated_length": 173.390625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3286271095275879, + "epoch": 2.7745098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016444057765425384, + "kl": 0.01911253109574318, + "learning_rate": 1.7829539773841608e-08, + "loss": 0.0002, + "num_tokens": 71403821.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.409652829170227, + "sampling/importance_sampling_ratio/mean": 1.0003719329833984, + "sampling/importance_sampling_ratio/min": 0.4979653060436249, + "sampling/sampling_logp_difference/max": 0.6972248554229736, + "sampling/sampling_logp_difference/mean": 0.013749302364885807, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 270.46875, + "completions/mean_terminated_length": 270.46875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.41420799493789673, + "epoch": 2.775735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7681025216946435, + "kl": 0.023575209081172943, + "learning_rate": 1.7641488671097606e-08, + "loss": 0.0032, + "num_tokens": 71439979.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.5751935243606567, + "sampling/importance_sampling_ratio/mean": 1.0003597736358643, + "sampling/importance_sampling_ratio/min": 0.6245823502540588, + "sampling/sampling_logp_difference/max": 0.4706721305847168, + "sampling/sampling_logp_difference/mean": 0.014989707618951797, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 196.203125, + "completions/mean_terminated_length": 196.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3639991581439972, + "epoch": 2.7769607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9776378086283738, + "kl": 0.027560634538531303, + "learning_rate": 1.745441672557335e-08, + "loss": -0.0501, + "num_tokens": 71469320.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.8810596466064453, + "sampling/importance_sampling_ratio/mean": 0.9995264410972595, + "sampling/importance_sampling_ratio/min": 0.6577244997024536, + "sampling/sampling_logp_difference/max": 0.6318352222442627, + "sampling/sampling_logp_difference/mean": 0.014080442488193512, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 200.5625, + "completions/mean_terminated_length": 200.5625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3483728766441345, + "epoch": 2.778186274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7482550607243168, + "kl": 0.02732473611831665, + "learning_rate": 1.7268324317012973e-08, + "loss": -0.0, + "num_tokens": 71505644.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5977916717529297, + "sampling/importance_sampling_ratio/mean": 1.0005908012390137, + "sampling/importance_sampling_ratio/min": 0.6355277299880981, + "sampling/sampling_logp_difference/max": 0.46862244606018066, + "sampling/sampling_logp_difference/mean": 0.01260870136320591, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 205.875, + "completions/mean_terminated_length": 205.875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3717099130153656, + "epoch": 2.7794117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01416227546876349, + "kl": 0.030278926715254784, + "learning_rate": 1.7083211823172184e-08, + "loss": 0.0003, + "num_tokens": 71548436.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6404244899749756, + "sampling/importance_sampling_ratio/mean": 0.9999901056289673, + "sampling/importance_sampling_ratio/min": 0.6045262217521667, + "sampling/sampling_logp_difference/max": 0.5033102035522461, + "sampling/sampling_logp_difference/mean": 0.014114852994680405, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 161.03125, + "completions/mean_terminated_length": 161.03125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3516573905944824, + "epoch": 2.780637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019098955354129955, + "kl": 0.026305729523301125, + "learning_rate": 1.6899079619817792e-08, + "loss": 0.0003, + "num_tokens": 71579350.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4534450769424438, + "sampling/importance_sampling_ratio/mean": 0.9997409582138062, + "sampling/importance_sampling_ratio/min": 0.5638967752456665, + "sampling/sampling_logp_difference/max": 0.5728840827941895, + "sampling/sampling_logp_difference/mean": 0.015011224895715714, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 202.921875, + "completions/mean_terminated_length": 202.921875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3277509808540344, + "epoch": 2.781862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011914898594531468, + "kl": 0.016980361193418503, + "learning_rate": 1.6715928080726415e-08, + "loss": 0.0002, + "num_tokens": 71606769.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4952607154846191, + "sampling/importance_sampling_ratio/mean": 0.9994877576828003, + "sampling/importance_sampling_ratio/min": 0.6171972751617432, + "sampling/sampling_logp_difference/max": 0.48256659507751465, + "sampling/sampling_logp_difference/mean": 0.013675330206751823, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 229.046875, + "completions/mean_terminated_length": 229.046875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.33403104543685913, + "epoch": 2.7830882352941178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014551090533169688, + "kl": 0.022232726216316223, + "learning_rate": 1.653375757768405e-08, + "loss": 0.0002, + "num_tokens": 71643844.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5311309099197388, + "sampling/importance_sampling_ratio/mean": 0.9995133280754089, + "sampling/importance_sampling_ratio/min": 0.6396133303642273, + "sampling/sampling_logp_difference/max": 0.4468914866447449, + "sampling/sampling_logp_difference/mean": 0.013440671376883984, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 219.6875, + "completions/mean_terminated_length": 219.6875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.34039393067359924, + "epoch": 2.784313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02489054595610817, + "kl": 0.030884843319654465, + "learning_rate": 1.6352568480485275e-08, + "loss": 0.0003, + "num_tokens": 71677360.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5885577201843262, + "sampling/importance_sampling_ratio/mean": 0.9998325705528259, + "sampling/importance_sampling_ratio/min": 0.6298431158065796, + "sampling/sampling_logp_difference/max": 0.4628264904022217, + "sampling/sampling_logp_difference/mean": 0.014835665933787823, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 227.09375, + "completions/mean_terminated_length": 227.09375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.41055136919021606, + "epoch": 2.7855392156862746, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022139890170848206, + "kl": 0.022946510463953018, + "learning_rate": 1.6172361156932547e-08, + "loss": 0.0002, + "num_tokens": 71711094.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004057884216309, + "sampling/importance_sampling_ratio/min": 0.5827050805091858, + "sampling/sampling_logp_difference/max": 0.7382068634033203, + "sampling/sampling_logp_difference/mean": 0.015440518967807293, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 185.46875, + "completions/mean_terminated_length": 185.46875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3047197163105011, + "epoch": 2.786764705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017871062297091952, + "kl": 0.024374118074774742, + "learning_rate": 1.5993135972835303e-08, + "loss": 0.0002, + "num_tokens": 71736532.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4740394353866577, + "sampling/importance_sampling_ratio/mean": 0.999732255935669, + "sampling/importance_sampling_ratio/min": 0.36422932147979736, + "sampling/sampling_logp_difference/max": 1.0099716186523438, + "sampling/sampling_logp_difference/mean": 0.013879004865884781, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 191.78125, + "completions/mean_terminated_length": 191.78125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3157747983932495, + "epoch": 2.7879901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01774562023969574, + "kl": 0.021239880472421646, + "learning_rate": 1.581489329200919e-08, + "loss": 0.0002, + "num_tokens": 71766326.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 0.9995769262313843, + "sampling/importance_sampling_ratio/min": 0.6960172653198242, + "sampling/sampling_logp_difference/max": 0.49199533462524414, + "sampling/sampling_logp_difference/mean": 0.013254116289317608, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 200.3125, + "completions/mean_terminated_length": 200.3125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3812054991722107, + "epoch": 2.7892156862745097, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.760931902743617, + "kl": 0.10009460896253586, + "learning_rate": 1.5637633476275724e-08, + "loss": -0.0047, + "num_tokens": 71795178.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.2994608879089355, + "sampling/importance_sampling_ratio/mean": 1.0003132820129395, + "sampling/importance_sampling_ratio/min": 0.62801194190979, + "sampling/sampling_logp_difference/max": 0.4651961326599121, + "sampling/sampling_logp_difference/mean": 0.015429170802235603, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 186.578125, + "completions/mean_terminated_length": 186.578125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.2862929105758667, + "epoch": 2.7904411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7032311059803306, + "kl": 0.02835479937493801, + "learning_rate": 1.5461356885461075e-08, + "loss": 0.0011, + "num_tokens": 71821439.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.590358853340149, + "sampling/importance_sampling_ratio/mean": 0.9992215037345886, + "sampling/importance_sampling_ratio/min": 0.6701014041900635, + "sampling/sampling_logp_difference/max": 0.4639596939086914, + "sampling/sampling_logp_difference/mean": 0.011490806937217712, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 251.4375, + "completions/mean_terminated_length": 251.4375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.4205988347530365, + "epoch": 2.7916666666666665, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6352273994365715, + "kl": 0.04728353023529053, + "learning_rate": 1.528606387739545e-08, + "loss": 0.0001, + "num_tokens": 71862955.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.7753299474716187, + "sampling/importance_sampling_ratio/mean": 0.9998862147331238, + "sampling/importance_sampling_ratio/min": 0.6273989677429199, + "sampling/sampling_logp_difference/max": 0.5739863514900208, + "sampling/sampling_logp_difference/mean": 0.014236108399927616, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 216.28125, + "completions/mean_terminated_length": 216.28125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3384827673435211, + "epoch": 2.792892156862745, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0139936031800226, + "kl": 0.021461669355630875, + "learning_rate": 1.5111754807912546e-08, + "loss": 0.0002, + "num_tokens": 71892045.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5768814086914062, + "sampling/importance_sampling_ratio/mean": 1.000150203704834, + "sampling/importance_sampling_ratio/min": 0.6033788919448853, + "sampling/sampling_logp_difference/max": 0.5052099227905273, + "sampling/sampling_logp_difference/mean": 0.014072628691792488, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 206.703125, + "completions/mean_terminated_length": 206.703125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.26644182205200195, + "epoch": 2.7941176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013526680091452808, + "kl": 0.016012419015169144, + "learning_rate": 1.493843003084888e-08, + "loss": 0.0002, + "num_tokens": 71927162.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.403918981552124, + "sampling/importance_sampling_ratio/mean": 0.9997563362121582, + "sampling/importance_sampling_ratio/min": 0.620281457901001, + "sampling/sampling_logp_difference/max": 0.4775819778442383, + "sampling/sampling_logp_difference/mean": 0.011524304747581482, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 163.125, + "completions/mean_terminated_length": 163.125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.34191954135894775, + "epoch": 2.795343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02604340860845786, + "kl": 0.029353775084018707, + "learning_rate": 1.4766089898042677e-08, + "loss": 0.0003, + "num_tokens": 71953890.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9246152639389038, + "sampling/importance_sampling_ratio/mean": 1.0005629062652588, + "sampling/importance_sampling_ratio/min": 0.6502727270126343, + "sampling/sampling_logp_difference/max": 0.6547261476516724, + "sampling/sampling_logp_difference/mean": 0.014699749648571014, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 177.5, + "completions/mean_terminated_length": 177.5, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.3046913743019104, + "epoch": 2.7965686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014346661295387824, + "kl": 0.018736181780695915, + "learning_rate": 1.4594734759333482e-08, + "loss": 0.0002, + "num_tokens": 71983954.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6348410844802856, + "sampling/importance_sampling_ratio/mean": 1.0004372596740723, + "sampling/importance_sampling_ratio/min": 0.6498619914054871, + "sampling/sampling_logp_difference/max": 0.4915456771850586, + "sampling/sampling_logp_difference/mean": 0.013584461063146591, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 166.15625, + "completions/mean_terminated_length": 166.15625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3658546507358551, + "epoch": 2.797794117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015578315080727533, + "kl": 0.022142380475997925, + "learning_rate": 1.4424364962561386e-08, + "loss": 0.0002, + "num_tokens": 72012908.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4352006912231445, + "sampling/importance_sampling_ratio/mean": 1.000298261642456, + "sampling/importance_sampling_ratio/min": 0.6274946928024292, + "sampling/sampling_logp_difference/max": 0.4660201072692871, + "sampling/sampling_logp_difference/mean": 0.014988088980317116, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 172.84375, + "completions/mean_terminated_length": 172.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.39948737621307373, + "epoch": 2.799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9562705683755706, + "kl": 0.09000791609287262, + "learning_rate": 1.4254980853566246e-08, + "loss": 0.0205, + "num_tokens": 72042242.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5422638654708862, + "sampling/importance_sampling_ratio/mean": 0.9992375373840332, + "sampling/importance_sampling_ratio/min": 0.6525204181671143, + "sampling/sampling_logp_difference/max": 0.43325138092041016, + "sampling/sampling_logp_difference/mean": 0.0161836426705122, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 170.71875, + "completions/mean_terminated_length": 170.71875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3471698760986328, + "epoch": 2.8002450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021503997632601194, + "kl": 0.028218526393175125, + "learning_rate": 1.4086582776187239e-08, + "loss": 0.0003, + "num_tokens": 72073456.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4221441745758057, + "sampling/importance_sampling_ratio/mean": 1.0006734132766724, + "sampling/importance_sampling_ratio/min": 0.6298839449882507, + "sampling/sampling_logp_difference/max": 0.4622197151184082, + "sampling/sampling_logp_difference/mean": 0.015107318758964539, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 228.875, + "completions/mean_terminated_length": 228.875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.44408896565437317, + "epoch": 2.8014705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7316867389262184, + "kl": 0.028132587671279907, + "learning_rate": 1.3919171072261537e-08, + "loss": -0.0022, + "num_tokens": 72109064.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5388634204864502, + "sampling/importance_sampling_ratio/mean": 1.0005836486816406, + "sampling/importance_sampling_ratio/min": 0.6987859010696411, + "sampling/sampling_logp_difference/max": 0.4310441017150879, + "sampling/sampling_logp_difference/mean": 0.015850011259317398, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 225.75, + "completions/mean_terminated_length": 225.75, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3989129066467285, + "epoch": 2.8026960784313726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014688646078152737, + "kl": 0.020441781729459763, + "learning_rate": 1.3752746081624467e-08, + "loss": 0.0002, + "num_tokens": 72143688.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4659093618392944, + "sampling/importance_sampling_ratio/mean": 0.9998365640640259, + "sampling/importance_sampling_ratio/min": 0.6817200183868408, + "sampling/sampling_logp_difference/max": 0.3831362724304199, + "sampling/sampling_logp_difference/mean": 0.014877324923872948, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 197.734375, + "completions/mean_terminated_length": 197.734375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.31073617935180664, + "epoch": 2.803921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012405867713131104, + "kl": 0.019323110580444336, + "learning_rate": 1.3587308142108178e-08, + "loss": 0.0002, + "num_tokens": 72174919.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4327930212020874, + "sampling/importance_sampling_ratio/mean": 1.0001391172409058, + "sampling/importance_sampling_ratio/min": 0.6468679308891296, + "sampling/sampling_logp_difference/max": 0.43561315536499023, + "sampling/sampling_logp_difference/mean": 0.013890000060200691, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 175.15625, + "completions/mean_terminated_length": 175.15625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3362789750099182, + "epoch": 2.8051470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017843234505621534, + "kl": 0.024533363059163094, + "learning_rate": 1.3422857589541148e-08, + "loss": 0.0002, + "num_tokens": 72200481.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.433450698852539, + "sampling/importance_sampling_ratio/mean": 1.0008256435394287, + "sampling/importance_sampling_ratio/min": 0.650992751121521, + "sampling/sampling_logp_difference/max": 0.429256796836853, + "sampling/sampling_logp_difference/mean": 0.014599323272705078, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 209.5, + "completions/mean_terminated_length": 209.5, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.36659950017929077, + "epoch": 2.806372549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7103152574303739, + "kl": 0.06570275127887726, + "learning_rate": 1.3259394757747677e-08, + "loss": 0.0083, + "num_tokens": 72228033.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5661461353302002, + "sampling/importance_sampling_ratio/mean": 0.9998388290405273, + "sampling/importance_sampling_ratio/min": 0.6300721168518066, + "sampling/sampling_logp_difference/max": 0.46192097663879395, + "sampling/sampling_logp_difference/mean": 0.015525387600064278, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 228.828125, + "completions/mean_terminated_length": 228.828125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3071904182434082, + "epoch": 2.8075980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016346678045750535, + "kl": 0.020016958937048912, + "learning_rate": 1.3096919978546838e-08, + "loss": 0.0002, + "num_tokens": 72257862.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003342628479004, + "sampling/importance_sampling_ratio/min": 0.6487526297569275, + "sampling/sampling_logp_difference/max": 0.9694099426269531, + "sampling/sampling_logp_difference/mean": 0.013793135061860085, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 200.453125, + "completions/mean_terminated_length": 200.453125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3286440074443817, + "epoch": 2.8088235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013722676646069513, + "kl": 0.019434725865721703, + "learning_rate": 1.2935433581752365e-08, + "loss": 0.0002, + "num_tokens": 72287827.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.533599853515625, + "sampling/importance_sampling_ratio/mean": 0.9998020529747009, + "sampling/importance_sampling_ratio/min": 0.6816995143890381, + "sampling/sampling_logp_difference/max": 0.42761778831481934, + "sampling/sampling_logp_difference/mean": 0.013315289281308651, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 256.859375, + "completions/mean_terminated_length": 256.859375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3069850206375122, + "epoch": 2.810049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013881546272108672, + "kl": 0.017193015664815903, + "learning_rate": 1.2774935895171091e-08, + "loss": 0.0002, + "num_tokens": 72320202.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5744256973266602, + "sampling/importance_sampling_ratio/mean": 1.0000576972961426, + "sampling/importance_sampling_ratio/min": 0.6482973694801331, + "sampling/sampling_logp_difference/max": 0.4538905620574951, + "sampling/sampling_logp_difference/mean": 0.011956065893173218, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 200.953125, + "completions/mean_terminated_length": 200.953125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.25608932971954346, + "epoch": 2.811274509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016591961813083148, + "kl": 0.025756172835826874, + "learning_rate": 1.2615427244603405e-08, + "loss": 0.0003, + "num_tokens": 72347927.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5533801317214966, + "sampling/importance_sampling_ratio/mean": 1.000268578529358, + "sampling/importance_sampling_ratio/min": 0.6556463241577148, + "sampling/sampling_logp_difference/max": 0.4404332637786865, + "sampling/sampling_logp_difference/mean": 0.010999459773302078, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 218.984375, + "completions/mean_terminated_length": 218.984375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.3019634187221527, + "epoch": 2.8125, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01433802192062887, + "kl": 0.019851867109537125, + "learning_rate": 1.2456907953841633e-08, + "loss": 0.0002, + "num_tokens": 72378054.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.405820369720459, + "sampling/importance_sampling_ratio/mean": 0.9999945163726807, + "sampling/importance_sampling_ratio/min": 0.6096331477165222, + "sampling/sampling_logp_difference/max": 0.49489784240722656, + "sampling/sampling_logp_difference/mean": 0.012230746448040009, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 214.59375, + "completions/mean_terminated_length": 214.59375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3920062482357025, + "epoch": 2.813725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1001250463806667, + "kl": 0.033399369567632675, + "learning_rate": 1.2299378344669986e-08, + "loss": -0.0648, + "num_tokens": 72406684.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4886319637298584, + "sampling/importance_sampling_ratio/mean": 0.9996265769004822, + "sampling/importance_sampling_ratio/min": 0.5238063335418701, + "sampling/sampling_logp_difference/max": 0.6466332674026489, + "sampling/sampling_logp_difference/mean": 0.014440304599702358, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 252.5625, + "completions/mean_terminated_length": 252.5625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.40310561656951904, + "epoch": 2.814950980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01572991801480214, + "kl": 0.023704007267951965, + "learning_rate": 1.2142838736863559e-08, + "loss": 0.0002, + "num_tokens": 72438160.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.608891487121582, + "sampling/importance_sampling_ratio/mean": 1.000126838684082, + "sampling/importance_sampling_ratio/min": 0.4627833664417267, + "sampling/sampling_logp_difference/max": 0.7704962491989136, + "sampling/sampling_logp_difference/mean": 0.015635818243026733, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 166.4375, + "completions/mean_terminated_length": 166.4375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.4271998405456543, + "epoch": 2.8161764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0289753184079062, + "kl": 0.0330132395029068, + "learning_rate": 1.1987289448187777e-08, + "loss": -0.0284, + "num_tokens": 72466684.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.5406886339187622, + "sampling/importance_sampling_ratio/mean": 0.9999655485153198, + "sampling/importance_sampling_ratio/min": 0.7300183773040771, + "sampling/sampling_logp_difference/max": 0.43222951889038086, + "sampling/sampling_logp_difference/mean": 0.01558359619230032, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 217.3125, + "completions/mean_terminated_length": 217.3125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.4008779525756836, + "epoch": 2.8174019607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027301807947582572, + "kl": 0.027768369764089584, + "learning_rate": 1.183273079439795e-08, + "loss": 0.0003, + "num_tokens": 72503008.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5744314193725586, + "sampling/importance_sampling_ratio/mean": 1.000143051147461, + "sampling/importance_sampling_ratio/min": 0.5038020610809326, + "sampling/sampling_logp_difference/max": 0.6855719089508057, + "sampling/sampling_logp_difference/mean": 0.015518108382821083, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 197.578125, + "completions/mean_terminated_length": 197.578125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.4736689329147339, + "epoch": 2.818627450980392, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9397093439566856, + "kl": 0.03798678144812584, + "learning_rate": 1.167916308923822e-08, + "loss": -0.0049, + "num_tokens": 72538037.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4995191097259521, + "sampling/importance_sampling_ratio/mean": 0.9994603395462036, + "sampling/importance_sampling_ratio/min": 0.6323956847190857, + "sampling/sampling_logp_difference/max": 0.4582400321960449, + "sampling/sampling_logp_difference/mean": 0.016999687999486923, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 232.640625, + "completions/mean_terminated_length": 232.640625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.27715107798576355, + "epoch": 2.8198529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03962575330413272, + "kl": 0.025751516222953796, + "learning_rate": 1.152658664444145e-08, + "loss": 0.0002, + "num_tokens": 72571502.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.434734582901001, + "sampling/importance_sampling_ratio/mean": 1.0000114440917969, + "sampling/importance_sampling_ratio/min": 0.6264632344245911, + "sampling/sampling_logp_difference/max": 0.4676651954650879, + "sampling/sampling_logp_difference/mean": 0.011388463899493217, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 219.28125, + "completions/mean_terminated_length": 219.28125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.4073230028152466, + "epoch": 2.821078431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0163658782529671, + "kl": 0.024018779397010803, + "learning_rate": 1.1375001769727999e-08, + "loss": 0.0002, + "num_tokens": 72604784.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6203665733337402, + "sampling/importance_sampling_ratio/mean": 0.9998045563697815, + "sampling/importance_sampling_ratio/min": 0.6656513810157776, + "sampling/sampling_logp_difference/max": 0.4826524257659912, + "sampling/sampling_logp_difference/mean": 0.015417106449604034, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 220.4375, + "completions/mean_terminated_length": 220.4375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.41577786207199097, + "epoch": 2.8223039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7588138821700442, + "kl": 0.047006770968437195, + "learning_rate": 1.1224408772805671e-08, + "loss": 0.0089, + "num_tokens": 72638092.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.9848963022232056, + "sampling/importance_sampling_ratio/mean": 0.9997138977050781, + "sampling/importance_sampling_ratio/min": 0.6137931942939758, + "sampling/sampling_logp_difference/max": 0.6855666637420654, + "sampling/sampling_logp_difference/mean": 0.015783226117491722, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 215.28125, + "completions/mean_terminated_length": 215.28125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.40591156482696533, + "epoch": 2.8235294117647056, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4637659212274494, + "kl": 0.08024704456329346, + "learning_rate": 1.1074807959368715e-08, + "loss": 0.043, + "num_tokens": 72668238.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.411345362663269, + "sampling/importance_sampling_ratio/mean": 1.0003767013549805, + "sampling/importance_sampling_ratio/min": 0.6954315900802612, + "sampling/sampling_logp_difference/max": 0.363222599029541, + "sampling/sampling_logp_difference/mean": 0.014792080037295818, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 210.0625, + "completions/mean_terminated_length": 210.0625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3982296884059906, + "epoch": 2.8247549019607843, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2458175002931993, + "kl": 0.027796974405646324, + "learning_rate": 1.0926199633097154e-08, + "loss": 0.0093, + "num_tokens": 72699442.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4560644626617432, + "sampling/importance_sampling_ratio/mean": 0.9998251795768738, + "sampling/importance_sampling_ratio/min": 0.5910095572471619, + "sampling/sampling_logp_difference/max": 0.5259230136871338, + "sampling/sampling_logp_difference/mean": 0.01536161731928587, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 174.234375, + "completions/mean_terminated_length": 174.234375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.42545390129089355, + "epoch": 2.825980392156863, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1544362220501005, + "kl": 0.04277203232049942, + "learning_rate": 1.0778584095656685e-08, + "loss": -0.0173, + "num_tokens": 72724449.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5674940347671509, + "sampling/importance_sampling_ratio/mean": 1.0002422332763672, + "sampling/importance_sampling_ratio/min": 0.6310010552406311, + "sampling/sampling_logp_difference/max": 0.4604477882385254, + "sampling/sampling_logp_difference/mean": 0.017049824818968773, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 234.296875, + "completions/mean_terminated_length": 234.296875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.502498209476471, + "epoch": 2.827205882352941, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0590844835191695, + "kl": 0.043628957122564316, + "learning_rate": 1.0631961646697384e-08, + "loss": 0.0025, + "num_tokens": 72762852.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.8157446384429932, + "sampling/importance_sampling_ratio/mean": 1.0002353191375732, + "sampling/importance_sampling_ratio/min": 0.6138596534729004, + "sampling/sampling_logp_difference/max": 0.5964956283569336, + "sampling/sampling_logp_difference/mean": 0.016772117465734482, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 232.6875, + "completions/mean_terminated_length": 232.6875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3923417329788208, + "epoch": 2.8284313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8466391252933121, + "kl": 0.029825102537870407, + "learning_rate": 1.0486332583853564e-08, + "loss": 0.0663, + "num_tokens": 72799488.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.555580496788025, + "sampling/importance_sampling_ratio/mean": 0.999732494354248, + "sampling/importance_sampling_ratio/min": 0.7178300619125366, + "sampling/sampling_logp_difference/max": 0.4418487548828125, + "sampling/sampling_logp_difference/mean": 0.013146606273949146, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 164.828125, + "completions/mean_terminated_length": 164.828125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3960435390472412, + "epoch": 2.829656862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02599589169385834, + "kl": 0.03480679541826248, + "learning_rate": 1.0341697202742971e-08, + "loss": 0.0004, + "num_tokens": 72826741.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4337533712387085, + "sampling/importance_sampling_ratio/mean": 0.9996919631958008, + "sampling/importance_sampling_ratio/min": 0.690007209777832, + "sampling/sampling_logp_difference/max": 0.37105321884155273, + "sampling/sampling_logp_difference/mean": 0.01564282365143299, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 226.875, + "completions/mean_terminated_length": 226.875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.37923213839530945, + "epoch": 2.8308823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018401309831405036, + "kl": 0.020857226103544235, + "learning_rate": 1.0198055796966253e-08, + "loss": 0.0002, + "num_tokens": 72866397.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5018757581710815, + "sampling/importance_sampling_ratio/mean": 1.0000402927398682, + "sampling/importance_sampling_ratio/min": 0.6099699139595032, + "sampling/sampling_logp_difference/max": 0.49434566497802734, + "sampling/sampling_logp_difference/mean": 0.014194940216839314, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 179.34375, + "completions/mean_terminated_length": 179.34375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4174288511276245, + "epoch": 2.832107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019770007295590857, + "kl": 0.024596508592367172, + "learning_rate": 1.0055408658106446e-08, + "loss": 0.0002, + "num_tokens": 72895619.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5415550470352173, + "sampling/importance_sampling_ratio/mean": 1.0004100799560547, + "sampling/importance_sampling_ratio/min": 0.6368833780288696, + "sampling/sampling_logp_difference/max": 0.4511687755584717, + "sampling/sampling_logp_difference/mean": 0.01626231148838997, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 212.96875, + "completions/mean_terminated_length": 212.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3599066138267517, + "epoch": 2.8333333333333335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04102920645319019, + "kl": 0.030512923374772072, + "learning_rate": 9.913756075728086e-09, + "loss": 0.0003, + "num_tokens": 72926705.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.00050950050354, + "sampling/importance_sampling_ratio/min": 0.3303156793117523, + "sampling/sampling_logp_difference/max": 1.1077064275741577, + "sampling/sampling_logp_difference/mean": 0.015415752306580544, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 214.46875, + "completions/mean_terminated_length": 214.46875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.41638386249542236, + "epoch": 2.8345588235294117, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.317533006833584, + "kl": 0.038041263818740845, + "learning_rate": 9.77309833737705e-09, + "loss": 0.0006, + "num_tokens": 72959503.0, + "reward": 0.0625, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4607181549072266, + "sampling/importance_sampling_ratio/mean": 1.000251293182373, + "sampling/importance_sampling_ratio/min": 0.6063253879547119, + "sampling/sampling_logp_difference/max": 0.5003385543823242, + "sampling/sampling_logp_difference/mean": 0.015667861327528954, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 224.671875, + "completions/mean_terminated_length": 224.671875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.3578198552131653, + "epoch": 2.8357843137254903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01726464198667003, + "kl": 0.024974340572953224, + "learning_rate": 9.633435728579553e-09, + "loss": 0.0002, + "num_tokens": 72999402.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5950409173965454, + "sampling/importance_sampling_ratio/mean": 0.9995691180229187, + "sampling/importance_sampling_ratio/min": 0.5801035165786743, + "sampling/sampling_logp_difference/max": 0.544548749923706, + "sampling/sampling_logp_difference/mean": 0.014314240776002407, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 179.203125, + "completions/mean_terminated_length": 179.203125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3301982879638672, + "epoch": 2.8370098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24249387395368324, + "kl": 0.030474865809082985, + "learning_rate": 9.494768532841868e-09, + "loss": 0.0003, + "num_tokens": 73026535.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4057986736297607, + "sampling/importance_sampling_ratio/mean": 0.999641478061676, + "sampling/importance_sampling_ratio/min": 0.01303944829851389, + "sampling/sampling_logp_difference/max": 4.339776039123535, + "sampling/sampling_logp_difference/mean": 0.014355506747961044, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 242.53125, + "completions/mean_terminated_length": 242.53125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.4580194354057312, + "epoch": 2.838235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022540072788519563, + "kl": 0.0541636198759079, + "learning_rate": 9.357097031649664e-09, + "loss": 0.0005, + "num_tokens": 73065625.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4006434679031372, + "sampling/importance_sampling_ratio/mean": 1.000105619430542, + "sampling/importance_sampling_ratio/min": 0.6771495342254639, + "sampling/sampling_logp_difference/max": 0.38986313343048096, + "sampling/sampling_logp_difference/mean": 0.01546635851264, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 231.734375, + "completions/mean_terminated_length": 231.734375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.4251118302345276, + "epoch": 2.8394607843137254, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7661100233961052, + "kl": 0.04458808898925781, + "learning_rate": 9.22042150446728e-09, + "loss": -0.0047, + "num_tokens": 73100952.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5140609741210938, + "sampling/importance_sampling_ratio/mean": 1.0003342628479004, + "sampling/importance_sampling_ratio/min": 0.6603867411613464, + "sampling/sampling_logp_difference/max": 0.4149296283721924, + "sampling/sampling_logp_difference/mean": 0.015295105054974556, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 173.84375, + "completions/mean_terminated_length": 173.84375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.29564523696899414, + "epoch": 2.840686274509804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9336704230419721, + "kl": 0.02050560712814331, + "learning_rate": 9.084742228737564e-09, + "loss": 0.0003, + "num_tokens": 73128046.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4100669622421265, + "sampling/importance_sampling_ratio/mean": 1.0000941753387451, + "sampling/importance_sampling_ratio/min": 0.6374108195304871, + "sampling/sampling_logp_difference/max": 0.45034098625183105, + "sampling/sampling_logp_difference/mean": 0.012831066735088825, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 215.828125, + "completions/mean_terminated_length": 215.828125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3264046311378479, + "epoch": 2.8419117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015784236326885567, + "kl": 0.022641684859991074, + "learning_rate": 8.95005947988059e-09, + "loss": 0.0002, + "num_tokens": 73162307.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3977603912353516, + "sampling/importance_sampling_ratio/mean": 0.9998221397399902, + "sampling/importance_sampling_ratio/min": 0.6920250058174133, + "sampling/sampling_logp_difference/max": 0.36813318729400635, + "sampling/sampling_logp_difference/mean": 0.012915970757603645, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 251.46875, + "completions/mean_terminated_length": 251.46875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.3470064401626587, + "epoch": 2.843137254901961, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8172689514942891, + "kl": 0.023917749524116516, + "learning_rate": 8.816373531293941e-09, + "loss": -0.03, + "num_tokens": 73206097.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4835994243621826, + "sampling/importance_sampling_ratio/mean": 1.0001312494277954, + "sampling/importance_sampling_ratio/min": 0.4880845844745636, + "sampling/sampling_logp_difference/max": 0.7172665596008301, + "sampling/sampling_logp_difference/mean": 0.013287542387843132, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 207.703125, + "completions/mean_terminated_length": 207.703125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.42378494143486023, + "epoch": 2.844362745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035807953869066766, + "kl": 0.060972969979047775, + "learning_rate": 8.683684654351597e-09, + "loss": 0.0005, + "num_tokens": 73237694.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4630006551742554, + "sampling/importance_sampling_ratio/mean": 1.000170111656189, + "sampling/importance_sampling_ratio/min": 0.6115177869796753, + "sampling/sampling_logp_difference/max": 0.49181127548217773, + "sampling/sampling_logp_difference/mean": 0.01597459241747856, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 173.15625, + "completions/mean_terminated_length": 173.15625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.32589611411094666, + "epoch": 2.8455882352941178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8919938548263948, + "kl": 0.036309003829956055, + "learning_rate": 8.551993118403656e-09, + "loss": -0.0197, + "num_tokens": 73271976.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.350554347038269, + "sampling/importance_sampling_ratio/mean": 0.9998493790626526, + "sampling/importance_sampling_ratio/min": 0.7448745965957642, + "sampling/sampling_logp_difference/max": 0.30051517486572266, + "sampling/sampling_logp_difference/mean": 0.012146501801908016, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 223.578125, + "completions/mean_terminated_length": 223.578125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3561001420021057, + "epoch": 2.846813725490196, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8875842059271485, + "kl": 0.026764214038848877, + "learning_rate": 8.4212991907755e-09, + "loss": 0.0014, + "num_tokens": 73305693.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.6464452743530273, + "sampling/importance_sampling_ratio/mean": 1.000075101852417, + "sampling/importance_sampling_ratio/min": 0.7136602997779846, + "sampling/sampling_logp_difference/max": 0.49861860275268555, + "sampling/sampling_logp_difference/mean": 0.013426331803202629, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 183.125, + "completions/mean_terminated_length": 183.125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.4191769063472748, + "epoch": 2.8480392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.311661427454457, + "kl": 0.048060342669487, + "learning_rate": 8.291603136767521e-09, + "loss": 0.0336, + "num_tokens": 73333221.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4079670906066895, + "sampling/importance_sampling_ratio/mean": 1.0000182390213013, + "sampling/importance_sampling_ratio/min": 0.7247595191001892, + "sampling/sampling_logp_difference/max": 0.3421468734741211, + "sampling/sampling_logp_difference/mean": 0.015923064202070236, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 162.625, + "completions/mean_terminated_length": 162.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.30206298828125, + "epoch": 2.849264705882353, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019728183790055673, + "kl": 0.02351086027920246, + "learning_rate": 8.16290521965457e-09, + "loss": 0.0002, + "num_tokens": 73357741.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5792306661605835, + "sampling/importance_sampling_ratio/mean": 1.0001418590545654, + "sampling/importance_sampling_ratio/min": 0.6430962681770325, + "sampling/sampling_logp_difference/max": 0.4569377899169922, + "sampling/sampling_logp_difference/mean": 0.013828947208821774, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 158.140625, + "completions/mean_terminated_length": 158.140625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.3143717646598816, + "epoch": 2.8504901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1638472881677286, + "kl": 0.0942501351237297, + "learning_rate": 8.035205700685165e-09, + "loss": 0.0258, + "num_tokens": 73384550.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4685288667678833, + "sampling/importance_sampling_ratio/mean": 0.9998546242713928, + "sampling/importance_sampling_ratio/min": 0.6580463647842407, + "sampling/sampling_logp_difference/max": 0.418479859828949, + "sampling/sampling_logp_difference/mean": 0.014786609448492527, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 201.875, + "completions/mean_terminated_length": 201.875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3219773769378662, + "epoch": 2.8517156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02119687951493217, + "kl": 0.03144240006804466, + "learning_rate": 7.908504839081342e-09, + "loss": 0.0003, + "num_tokens": 73414718.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4369810819625854, + "sampling/importance_sampling_ratio/mean": 0.9997972846031189, + "sampling/importance_sampling_ratio/min": 0.6411973237991333, + "sampling/sampling_logp_difference/max": 0.4444180727005005, + "sampling/sampling_logp_difference/mean": 0.013356061652302742, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 173.53125, + "completions/mean_terminated_length": 173.53125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3775599002838135, + "epoch": 2.8529411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0575776332387867, + "kl": 0.03039197437465191, + "learning_rate": 7.7828028920377e-09, + "loss": 0.0227, + "num_tokens": 73447600.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.290131688117981, + "sampling/importance_sampling_ratio/mean": 0.9999251961708069, + "sampling/importance_sampling_ratio/min": 0.4140039384365082, + "sampling/sampling_logp_difference/max": 0.8818798065185547, + "sampling/sampling_logp_difference/mean": 0.014906775206327438, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 204.375, + "completions/mean_terminated_length": 204.375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.34313100576400757, + "epoch": 2.8541666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01749915624120215, + "kl": 0.025798015296459198, + "learning_rate": 7.658100114721344e-09, + "loss": 0.0003, + "num_tokens": 73477944.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.546187162399292, + "sampling/importance_sampling_ratio/mean": 1.0003581047058105, + "sampling/importance_sampling_ratio/min": 0.6673808693885803, + "sampling/sampling_logp_difference/max": 0.4357919692993164, + "sampling/sampling_logp_difference/mean": 0.013357100076973438, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 260.546875, + "completions/mean_terminated_length": 260.546875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.4380151033401489, + "epoch": 2.855392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7354155385230504, + "kl": 0.029750129207968712, + "learning_rate": 7.534396760270956e-09, + "loss": -0.0146, + "num_tokens": 73516267.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.4922901391983032, + "sampling/importance_sampling_ratio/mean": 1.000532627105713, + "sampling/importance_sampling_ratio/min": 0.670768678188324, + "sampling/sampling_logp_difference/max": 0.4003119468688965, + "sampling/sampling_logp_difference/mean": 0.015685245394706726, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 227.65625, + "completions/mean_terminated_length": 227.65625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.36915498971939087, + "epoch": 2.8566176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013620967683211502, + "kl": 0.02155674248933792, + "learning_rate": 7.411693079796499e-09, + "loss": 0.0002, + "num_tokens": 73548373.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5426087379455566, + "sampling/importance_sampling_ratio/mean": 0.9998866319656372, + "sampling/importance_sampling_ratio/min": 0.6737273335456848, + "sampling/sampling_logp_difference/max": 0.4334750175476074, + "sampling/sampling_logp_difference/mean": 0.013658540323376656, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 175.59375, + "completions/mean_terminated_length": 175.59375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.43928152322769165, + "epoch": 2.857843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03764398945734357, + "kl": 0.0492483451962471, + "learning_rate": 7.289989322378731e-09, + "loss": 0.0005, + "num_tokens": 73577563.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4624541997909546, + "sampling/importance_sampling_ratio/mean": 0.9998334646224976, + "sampling/importance_sampling_ratio/min": 0.654796838760376, + "sampling/sampling_logp_difference/max": 0.4234302043914795, + "sampling/sampling_logp_difference/mean": 0.016958530992269516, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 221.46875, + "completions/mean_terminated_length": 221.46875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.3196999430656433, + "epoch": 2.8590686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046409294879126074, + "kl": 0.030097268521785736, + "learning_rate": 7.169285735068531e-09, + "loss": 0.0003, + "num_tokens": 73611641.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4039427042007446, + "sampling/importance_sampling_ratio/mean": 1.0001153945922852, + "sampling/importance_sampling_ratio/min": 0.6454610824584961, + "sampling/sampling_logp_difference/max": 0.4377903938293457, + "sampling/sampling_logp_difference/mean": 0.01287408173084259, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 176.234375, + "completions/mean_terminated_length": 176.234375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.42879849672317505, + "epoch": 2.860294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03494612726148045, + "kl": 0.04784668609499931, + "learning_rate": 7.049582562886513e-09, + "loss": 0.0005, + "num_tokens": 73636472.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3896585702896118, + "sampling/importance_sampling_ratio/mean": 0.9997230768203735, + "sampling/importance_sampling_ratio/min": 0.6368883848190308, + "sampling/sampling_logp_difference/max": 0.45116090774536133, + "sampling/sampling_logp_difference/mean": 0.01739996112883091, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 223.828125, + "completions/mean_terminated_length": 223.828125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.40986114740371704, + "epoch": 2.861519607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015932745230181365, + "kl": 0.029207296669483185, + "learning_rate": 6.930880048822529e-09, + "loss": 0.0003, + "num_tokens": 73667837.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5521903038024902, + "sampling/importance_sampling_ratio/mean": 1.0004559755325317, + "sampling/importance_sampling_ratio/min": 0.6305375099182129, + "sampling/sampling_logp_difference/max": 0.4611825942993164, + "sampling/sampling_logp_difference/mean": 0.014823229983448982, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 169.953125, + "completions/mean_terminated_length": 169.953125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.44827860593795776, + "epoch": 2.8627450980392157, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.538045583783523, + "kl": 0.07150943577289581, + "learning_rate": 6.813178433835221e-09, + "loss": -0.0011, + "num_tokens": 73690730.0, + "reward": 0.34375, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.395931363105774, + "sampling/importance_sampling_ratio/mean": 0.999872624874115, + "sampling/importance_sampling_ratio/min": 0.6334940195083618, + "sampling/sampling_logp_difference/max": 0.45650482177734375, + "sampling/sampling_logp_difference/mean": 0.01682509109377861, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 177.953125, + "completions/mean_terminated_length": 177.953125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.38354188203811646, + "epoch": 2.8639705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027594890610444674, + "kl": 0.022542208433151245, + "learning_rate": 6.696477956851354e-09, + "loss": 0.0002, + "num_tokens": 73722903.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6809242963790894, + "sampling/importance_sampling_ratio/mean": 1.0005621910095215, + "sampling/importance_sampling_ratio/min": 0.7015100717544556, + "sampling/sampling_logp_difference/max": 0.5193438529968262, + "sampling/sampling_logp_difference/mean": 0.01487412117421627, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 175.1875, + "completions/mean_terminated_length": 175.1875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3849712014198303, + "epoch": 2.8651960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3275793791177526, + "kl": 0.05220656096935272, + "learning_rate": 6.580778854765489e-09, + "loss": 0.0074, + "num_tokens": 73754547.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4683117866516113, + "sampling/importance_sampling_ratio/mean": 0.9991458654403687, + "sampling/importance_sampling_ratio/min": 0.6368657350540161, + "sampling/sampling_logp_difference/max": 0.45119643211364746, + "sampling/sampling_logp_difference/mean": 0.015320195816457272, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 294.125, + "completions/mean_terminated_length": 294.125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.42644044756889343, + "epoch": 2.866421568627451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0108201720755798, + "kl": 0.026010312139987946, + "learning_rate": 6.4660813624395905e-09, + "loss": 0.1167, + "num_tokens": 73793995.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3339488506317139, + "sampling/importance_sampling_ratio/mean": 1.0000989437103271, + "sampling/importance_sampling_ratio/min": 0.5947801470756531, + "sampling/sampling_logp_difference/max": 0.5195634365081787, + "sampling/sampling_logp_difference/mean": 0.014183470979332924, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 182.5, + "completions/mean_terminated_length": 182.5, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3695865273475647, + "epoch": 2.8676470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0166889012835278, + "kl": 0.025707734748721123, + "learning_rate": 6.3523857127021905e-09, + "loss": 0.0002, + "num_tokens": 73824715.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3675557374954224, + "sampling/importance_sampling_ratio/mean": 1.0001277923583984, + "sampling/importance_sampling_ratio/min": 0.6827847361564636, + "sampling/sampling_logp_difference/max": 0.3815755844116211, + "sampling/sampling_logp_difference/mean": 0.014930440112948418, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 188.46875, + "completions/mean_terminated_length": 188.46875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.38538146018981934, + "epoch": 2.868872549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018286261951037185, + "kl": 0.028369126841425896, + "learning_rate": 6.239692136348284e-09, + "loss": 0.0003, + "num_tokens": 73857337.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.411484718322754, + "sampling/importance_sampling_ratio/mean": 1.0004916191101074, + "sampling/importance_sampling_ratio/min": 0.6207948327064514, + "sampling/sampling_logp_difference/max": 0.47675466537475586, + "sampling/sampling_logp_difference/mean": 0.014215872623026371, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 205.453125, + "completions/mean_terminated_length": 205.453125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3979039788246155, + "epoch": 2.8700980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017187819769637448, + "kl": 0.026213616132736206, + "learning_rate": 6.12800086213866e-09, + "loss": 0.0003, + "num_tokens": 73889846.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4555035829544067, + "sampling/importance_sampling_ratio/mean": 1.0002341270446777, + "sampling/importance_sampling_ratio/min": 0.690625786781311, + "sampling/sampling_logp_difference/max": 0.3753519058227539, + "sampling/sampling_logp_difference/mean": 0.014101998880505562, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 209.453125, + "completions/mean_terminated_length": 209.453125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4886457622051239, + "epoch": 2.8713235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031221469579826702, + "kl": 0.049845144152641296, + "learning_rate": 6.017312116799566e-09, + "loss": 0.0006, + "num_tokens": 73920291.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5176514387130737, + "sampling/importance_sampling_ratio/mean": 0.9998294115066528, + "sampling/importance_sampling_ratio/min": 0.6151835918426514, + "sampling/sampling_logp_difference/max": 0.48583459854125977, + "sampling/sampling_logp_difference/mean": 0.016400672495365143, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 135.6875, + "completions/mean_terminated_length": 135.6875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3104097247123718, + "epoch": 2.872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02437347429895532, + "kl": 0.029835911467671394, + "learning_rate": 5.907626125022158e-09, + "loss": 0.0003, + "num_tokens": 73945487.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007180213928223, + "sampling/importance_sampling_ratio/mean": 0.9998512864112854, + "sampling/importance_sampling_ratio/min": 0.6778629422187805, + "sampling/sampling_logp_difference/max": 0.47045230865478516, + "sampling/sampling_logp_difference/mean": 0.014408271759748459, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 160.890625, + "completions/mean_terminated_length": 160.890625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3479102849960327, + "epoch": 2.873774509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022136621297683786, + "kl": 0.025515876710414886, + "learning_rate": 5.798943109461995e-09, + "loss": 0.0002, + "num_tokens": 73972104.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6368871927261353, + "sampling/importance_sampling_ratio/mean": 0.9992555975914001, + "sampling/importance_sampling_ratio/min": 0.6381741166114807, + "sampling/sampling_logp_difference/max": 0.4927964210510254, + "sampling/sampling_logp_difference/mean": 0.014105742797255516, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 198.609375, + "completions/mean_terminated_length": 198.609375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3937147259712219, + "epoch": 2.875, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8726929204105099, + "kl": 0.0347832590341568, + "learning_rate": 5.691263290738824e-09, + "loss": 0.009, + "num_tokens": 74004415.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5203908681869507, + "sampling/importance_sampling_ratio/mean": 1.0003485679626465, + "sampling/importance_sampling_ratio/min": 0.4887605905532837, + "sampling/sampling_logp_difference/max": 0.7158825397491455, + "sampling/sampling_logp_difference/mean": 0.014815937727689743, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 209.984375, + "completions/mean_terminated_length": 209.984375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.31797924637794495, + "epoch": 2.876225490196078, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01062699324841459, + "kl": 0.014805572107434273, + "learning_rate": 5.5845868874357385e-09, + "loss": 0.0001, + "num_tokens": 74039246.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5606545209884644, + "sampling/importance_sampling_ratio/mean": 0.9998297095298767, + "sampling/importance_sampling_ratio/min": 0.6799756288528442, + "sampling/sampling_logp_difference/max": 0.44510531425476074, + "sampling/sampling_logp_difference/mean": 0.01304752379655838, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 228.359375, + "completions/mean_terminated_length": 228.359375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4947429597377777, + "epoch": 2.877450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016340773511117686, + "kl": 0.02197902649641037, + "learning_rate": 5.4789141160991314e-09, + "loss": 0.0002, + "num_tokens": 74077061.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5613102912902832, + "sampling/importance_sampling_ratio/mean": 1.0001163482666016, + "sampling/importance_sampling_ratio/min": 0.6342008113861084, + "sampling/sampling_logp_difference/max": 0.45538973808288574, + "sampling/sampling_logp_difference/mean": 0.017344612628221512, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 175.84375, + "completions/mean_terminated_length": 175.84375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.39359569549560547, + "epoch": 2.8786764705882355, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9849048087035472, + "kl": 0.047245509922504425, + "learning_rate": 5.374245191238025e-09, + "loss": -0.0041, + "num_tokens": 74103291.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4255034923553467, + "sampling/importance_sampling_ratio/mean": 0.9997348785400391, + "sampling/importance_sampling_ratio/min": 0.6171954870223999, + "sampling/sampling_logp_difference/max": 0.48256945610046387, + "sampling/sampling_logp_difference/mean": 0.015792738646268845, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 189.703125, + "completions/mean_terminated_length": 189.703125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.3153325021266937, + "epoch": 2.8799019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6795044307555955, + "kl": 0.02400416135787964, + "learning_rate": 5.270580325323681e-09, + "loss": 0.0213, + "num_tokens": 74134984.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5746557712554932, + "sampling/importance_sampling_ratio/mean": 0.9998120069503784, + "sampling/importance_sampling_ratio/min": 0.6600251793861389, + "sampling/sampling_logp_difference/max": 0.4540367126464844, + "sampling/sampling_logp_difference/mean": 0.011926532723009586, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 204.75, + "completions/mean_terminated_length": 204.75, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.29931190609931946, + "epoch": 2.881127450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013776997919416478, + "kl": 0.01953669637441635, + "learning_rate": 5.167919728789271e-09, + "loss": 0.0002, + "num_tokens": 74164792.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4057984352111816, + "sampling/importance_sampling_ratio/mean": 1.0004501342773438, + "sampling/importance_sampling_ratio/min": 0.6177850961685181, + "sampling/sampling_logp_difference/max": 0.4816145896911621, + "sampling/sampling_logp_difference/mean": 0.013187164440751076, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 204.25, + "completions/mean_terminated_length": 204.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3355157971382141, + "epoch": 2.8823529411764706, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013278609391465654, + "kl": 0.019463874399662018, + "learning_rate": 5.0662636100292086e-09, + "loss": 0.0002, + "num_tokens": 74192504.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.466301679611206, + "sampling/importance_sampling_ratio/mean": 1.0001624822616577, + "sampling/importance_sampling_ratio/min": 0.658620297908783, + "sampling/sampling_logp_difference/max": 0.4176081418991089, + "sampling/sampling_logp_difference/mean": 0.014016900211572647, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 252.5, + "completions/mean_terminated_length": 252.5, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3749135434627533, + "epoch": 2.883578431372549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015083554479084932, + "kl": 0.023390740156173706, + "learning_rate": 4.965612175399092e-09, + "loss": 0.0002, + "num_tokens": 74232696.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.617496132850647, + "sampling/importance_sampling_ratio/mean": 0.99971604347229, + "sampling/importance_sampling_ratio/min": 0.6912395358085632, + "sampling/sampling_logp_difference/max": 0.4808793067932129, + "sampling/sampling_logp_difference/mean": 0.013231323100626469, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 217.40625, + "completions/mean_terminated_length": 217.40625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.375158429145813, + "epoch": 2.8848039215686274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7579212009089664, + "kl": 0.034684307873249054, + "learning_rate": 4.865965629214819e-09, + "loss": 0.0146, + "num_tokens": 74265970.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6056530475616455, + "sampling/importance_sampling_ratio/mean": 0.9999037384986877, + "sampling/importance_sampling_ratio/min": 0.6142159700393677, + "sampling/sampling_logp_difference/max": 0.4874086380004883, + "sampling/sampling_logp_difference/mean": 0.014152498915791512, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 212.359375, + "completions/mean_terminated_length": 212.359375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3427656888961792, + "epoch": 2.8860294117647056, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7357714851817373, + "kl": 0.031769394874572754, + "learning_rate": 4.767324173752696e-09, + "loss": -0.0264, + "num_tokens": 74294793.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4352306127548218, + "sampling/importance_sampling_ratio/mean": 1.0003530979156494, + "sampling/importance_sampling_ratio/min": 0.6146575212478638, + "sampling/sampling_logp_difference/max": 0.48669004440307617, + "sampling/sampling_logp_difference/mean": 0.01346497144550085, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 200.53125, + "completions/mean_terminated_length": 200.53125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.3489496111869812, + "epoch": 2.8872549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017132599119672152, + "kl": 0.023278038948774338, + "learning_rate": 4.669688009248607e-09, + "loss": 0.0002, + "num_tokens": 74327499.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5467315912246704, + "sampling/importance_sampling_ratio/mean": 1.0001590251922607, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.4826626777648926, + "sampling/sampling_logp_difference/mean": 0.013841088861227036, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 208.296875, + "completions/mean_terminated_length": 208.296875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.3163909912109375, + "epoch": 2.888480392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014622783543643882, + "kl": 0.02592034824192524, + "learning_rate": 4.5730573338976786e-09, + "loss": 0.0002, + "num_tokens": 74355950.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5763987302780151, + "sampling/importance_sampling_ratio/mean": 0.9997361302375793, + "sampling/importance_sampling_ratio/min": 0.6836771368980408, + "sampling/sampling_logp_difference/max": 0.4551429748535156, + "sampling/sampling_logp_difference/mean": 0.01313449814915657, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 249.96875, + "completions/mean_terminated_length": 249.96875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.323248028755188, + "epoch": 2.889705882352941, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03207862100978474, + "kl": 0.036106012761592865, + "learning_rate": 4.477432343854226e-09, + "loss": 0.0004, + "num_tokens": 74394956.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6612498760223389, + "sampling/importance_sampling_ratio/mean": 1.0000250339508057, + "sampling/importance_sampling_ratio/min": 0.4179401099681854, + "sampling/sampling_logp_difference/max": 0.8724172115325928, + "sampling/sampling_logp_difference/mean": 0.01283775083720684, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 217.734375, + "completions/mean_terminated_length": 217.734375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.273295134305954, + "epoch": 2.8909313725490198, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8761155246874087, + "kl": 0.020199408754706383, + "learning_rate": 4.382813233230698e-09, + "loss": -0.0175, + "num_tokens": 74425515.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6355791091918945, + "sampling/importance_sampling_ratio/mean": 0.9998001456260681, + "sampling/importance_sampling_ratio/min": 0.6581597924232483, + "sampling/sampling_logp_difference/max": 0.49199700355529785, + "sampling/sampling_logp_difference/mean": 0.011664999648928642, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 186.765625, + "completions/mean_terminated_length": 186.765625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3464294672012329, + "epoch": 2.892156862745098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017237330228140245, + "kl": 0.02178099937736988, + "learning_rate": 4.289200194098119e-09, + "loss": 0.0002, + "num_tokens": 74457436.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5837095975875854, + "sampling/importance_sampling_ratio/mean": 0.9997559189796448, + "sampling/importance_sampling_ratio/min": 0.5434461832046509, + "sampling/sampling_logp_difference/max": 0.6098246574401855, + "sampling/sampling_logp_difference/mean": 0.013934292830526829, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 218.625, + "completions/mean_terminated_length": 218.625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.4872405230998993, + "epoch": 2.8933823529411766, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7353622489765991, + "kl": 0.09670692682266235, + "learning_rate": 4.196593416484873e-09, + "loss": 0.0149, + "num_tokens": 74488004.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.7672679424285889, + "sampling/importance_sampling_ratio/mean": 1.0000057220458984, + "sampling/importance_sampling_ratio/min": 0.6771938800811768, + "sampling/sampling_logp_difference/max": 0.5694348812103271, + "sampling/sampling_logp_difference/mean": 0.016465749591588974, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 192.03125, + "completions/mean_terminated_length": 192.03125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.3528916835784912, + "epoch": 2.894607843137255, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8473547012911232, + "kl": 0.044261232018470764, + "learning_rate": 4.104993088376974e-09, + "loss": -0.0011, + "num_tokens": 74515478.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5398973226547241, + "sampling/importance_sampling_ratio/mean": 1.0009403228759766, + "sampling/importance_sampling_ratio/min": 0.637012243270874, + "sampling/sampling_logp_difference/max": 0.45096635818481445, + "sampling/sampling_logp_difference/mean": 0.014576076529920101, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 211.6875, + "completions/mean_terminated_length": 211.6875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.47120723128318787, + "epoch": 2.8958333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9557901675410637, + "kl": 0.058704107999801636, + "learning_rate": 4.0143993957171826e-09, + "loss": -0.0236, + "num_tokens": 74553778.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4382609128952026, + "sampling/importance_sampling_ratio/mean": 1.0003687143325806, + "sampling/importance_sampling_ratio/min": 0.2649329602718353, + "sampling/sampling_logp_difference/max": 1.3282785415649414, + "sampling/sampling_logp_difference/mean": 0.01628001034259796, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 210.3125, + "completions/mean_terminated_length": 210.3125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3334640860557556, + "epoch": 2.8970588235294117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01333760190161096, + "kl": 0.017741121351718903, + "learning_rate": 3.924812522404952e-09, + "loss": 0.0002, + "num_tokens": 74589862.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007710695266724, + "sampling/importance_sampling_ratio/mean": 1.0000438690185547, + "sampling/importance_sampling_ratio/min": 0.6262631416320801, + "sampling/sampling_logp_difference/max": 0.4704854488372803, + "sampling/sampling_logp_difference/mean": 0.013768445700407028, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 231.015625, + "completions/mean_terminated_length": 231.015625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.31530332565307617, + "epoch": 2.8982843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7073960304740204, + "kl": 0.027371685951948166, + "learning_rate": 3.836232650296034e-09, + "loss": -0.0103, + "num_tokens": 74623879.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.2983132600784302, + "sampling/importance_sampling_ratio/mean": 0.9995091557502747, + "sampling/importance_sampling_ratio/min": 0.648907482624054, + "sampling/sampling_logp_difference/max": 0.4324650764465332, + "sampling/sampling_logp_difference/mean": 0.011638626456260681, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 199.5625, + "completions/mean_terminated_length": 199.5625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.42891377210617065, + "epoch": 2.8995098039215685, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9962438696300998, + "kl": 0.05536392331123352, + "learning_rate": 3.748659959201928e-09, + "loss": 0.0191, + "num_tokens": 74653419.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5813865661621094, + "sampling/importance_sampling_ratio/mean": 1.0003461837768555, + "sampling/importance_sampling_ratio/min": 0.6171907782554626, + "sampling/sampling_logp_difference/max": 0.4825770854949951, + "sampling/sampling_logp_difference/mean": 0.015942316502332687, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 172.546875, + "completions/mean_terminated_length": 172.546875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3402683138847351, + "epoch": 2.900735294117647, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1070267439657486, + "kl": 0.03943384811282158, + "learning_rate": 3.6620946268896556e-09, + "loss": 0.0254, + "num_tokens": 74678990.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9991967678070068, + "sampling/importance_sampling_ratio/min": 0.6189759373664856, + "sampling/sampling_logp_difference/max": 0.709916353225708, + "sampling/sampling_logp_difference/mean": 0.015335087664425373, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 212.890625, + "completions/mean_terminated_length": 212.890625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.28859928250312805, + "epoch": 2.9019607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011678689245716639, + "kl": 0.01910664513707161, + "learning_rate": 3.5765368290813223e-09, + "loss": 0.0002, + "num_tokens": 74711639.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3941144943237305, + "sampling/importance_sampling_ratio/mean": 0.9995889663696289, + "sampling/importance_sampling_ratio/min": 0.6953766942024231, + "sampling/sampling_logp_difference/max": 0.36330151557922363, + "sampling/sampling_logp_difference/mean": 0.01146257109940052, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 186.421875, + "completions/mean_terminated_length": 186.421875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.38304442167282104, + "epoch": 2.903186274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016711314854557412, + "kl": 0.02678157389163971, + "learning_rate": 3.491986739453889e-09, + "loss": 0.0003, + "num_tokens": 74743730.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6008230447769165, + "sampling/importance_sampling_ratio/mean": 0.9997867345809937, + "sampling/importance_sampling_ratio/min": 0.607703685760498, + "sampling/sampling_logp_difference/max": 0.49806785583496094, + "sampling/sampling_logp_difference/mean": 0.015621802769601345, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 144.78125, + "completions/mean_terminated_length": 144.78125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.30216503143310547, + "epoch": 2.9044117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01780466805583866, + "kl": 0.025277363136410713, + "learning_rate": 3.4084445296386767e-09, + "loss": 0.0002, + "num_tokens": 74773316.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4464720487594604, + "sampling/importance_sampling_ratio/mean": 1.0005719661712646, + "sampling/importance_sampling_ratio/min": 0.6782976388931274, + "sampling/sampling_logp_difference/max": 0.3881690502166748, + "sampling/sampling_logp_difference/mean": 0.01319819875061512, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 197.171875, + "completions/mean_terminated_length": 197.171875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.3905153274536133, + "epoch": 2.905637254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019401886283428777, + "kl": 0.027070432901382446, + "learning_rate": 3.3259103692209745e-09, + "loss": 0.0003, + "num_tokens": 74803295.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5972893238067627, + "sampling/importance_sampling_ratio/mean": 0.9990304112434387, + "sampling/importance_sampling_ratio/min": 0.5910945534706116, + "sampling/sampling_logp_difference/max": 0.5257792472839355, + "sampling/sampling_logp_difference/mean": 0.016149362549185753, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 197.3125, + "completions/mean_terminated_length": 197.3125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3718438148498535, + "epoch": 2.906862745098039, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023195987676754584, + "kl": 0.031065698713064194, + "learning_rate": 3.2443844257400434e-09, + "loss": 0.0003, + "num_tokens": 74839683.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5779024362564087, + "sampling/importance_sampling_ratio/mean": 1.0002520084381104, + "sampling/importance_sampling_ratio/min": 0.6434327363967896, + "sampling/sampling_logp_difference/max": 0.4560964107513428, + "sampling/sampling_logp_difference/mean": 0.015002873726189137, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 220.25, + "completions/mean_terminated_length": 220.25, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.35181066393852234, + "epoch": 2.9080882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1243511056485311, + "kl": 0.0297673549503088, + "learning_rate": 3.163866864688336e-09, + "loss": -0.0024, + "num_tokens": 74872547.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5072041749954224, + "sampling/importance_sampling_ratio/mean": 1.0001254081726074, + "sampling/importance_sampling_ratio/min": 0.5031969547271729, + "sampling/sampling_logp_difference/max": 0.6867736577987671, + "sampling/sampling_logp_difference/mean": 0.013822752982378006, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 238.78125, + "completions/mean_terminated_length": 238.78125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.3895314335823059, + "epoch": 2.909313725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014679421412962962, + "kl": 0.018656454980373383, + "learning_rate": 3.0843578495113877e-09, + "loss": 0.0002, + "num_tokens": 74906645.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4728115797042847, + "sampling/importance_sampling_ratio/mean": 0.9996693134307861, + "sampling/importance_sampling_ratio/min": 0.5432451963424683, + "sampling/sampling_logp_difference/max": 0.6101944446563721, + "sampling/sampling_logp_difference/mean": 0.014664944261312485, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 176.171875, + "completions/mean_terminated_length": 176.171875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3867795765399933, + "epoch": 2.9105392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8642869450296733, + "kl": 0.10119754076004028, + "learning_rate": 3.0058575416073707e-09, + "loss": -0.0125, + "num_tokens": 74934032.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6431214809417725, + "sampling/importance_sampling_ratio/mean": 0.9997879266738892, + "sampling/importance_sampling_ratio/min": 0.6962983012199402, + "sampling/sampling_logp_difference/max": 0.4965977668762207, + "sampling/sampling_logp_difference/mean": 0.015408418141305447, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 163.5625, + "completions/mean_terminated_length": 163.5625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3355710208415985, + "epoch": 2.911764705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9124505186964353, + "kl": 0.02697381004691124, + "learning_rate": 2.9283661003270952e-09, + "loss": 0.0412, + "num_tokens": 74962628.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5278233289718628, + "sampling/importance_sampling_ratio/mean": 1.0000895261764526, + "sampling/importance_sampling_ratio/min": 0.6484197974205017, + "sampling/sampling_logp_difference/max": 0.43321692943573, + "sampling/sampling_logp_difference/mean": 0.014956897124648094, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 199.328125, + "completions/mean_terminated_length": 199.328125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.38779014348983765, + "epoch": 2.9129901960784315, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7607617709264936, + "kl": 0.036527037620544434, + "learning_rate": 2.851883682973233e-09, + "loss": 0.0184, + "num_tokens": 74996057.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.2827922105789185, + "sampling/importance_sampling_ratio/mean": 0.9993733763694763, + "sampling/importance_sampling_ratio/min": 0.6301738619804382, + "sampling/sampling_logp_difference/max": 0.4617595672607422, + "sampling/sampling_logp_difference/mean": 0.013738825917243958, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 226.3125, + "completions/mean_terminated_length": 226.3125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.28482919931411743, + "epoch": 2.9142156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01537201761713231, + "kl": 0.024184320122003555, + "learning_rate": 2.776410444800148e-09, + "loss": 0.0003, + "num_tokens": 75030781.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4735807180404663, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.6046412587165833, + "sampling/sampling_logp_difference/max": 0.503119945526123, + "sampling/sampling_logp_difference/mean": 0.01173341367393732, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 184.28125, + "completions/mean_terminated_length": 184.28125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3381809890270233, + "epoch": 2.9154411764705883, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014636593004207615, + "kl": 0.021457120776176453, + "learning_rate": 2.701946539013844e-09, + "loss": 0.0002, + "num_tokens": 75060031.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.530226230621338, + "sampling/importance_sampling_ratio/mean": 0.9997565746307373, + "sampling/importance_sampling_ratio/min": 0.675378680229187, + "sampling/sampling_logp_difference/max": 0.4254155158996582, + "sampling/sampling_logp_difference/mean": 0.014186479151248932, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 183.390625, + "completions/mean_terminated_length": 183.390625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.36331191658973694, + "epoch": 2.9166666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012508191222585525, + "kl": 0.020261242985725403, + "learning_rate": 2.628492116771297e-09, + "loss": 0.0002, + "num_tokens": 75090312.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6600890159606934, + "sampling/importance_sampling_ratio/mean": 1.0003174543380737, + "sampling/importance_sampling_ratio/min": 0.6446958184242249, + "sampling/sampling_logp_difference/max": 0.506871223449707, + "sampling/sampling_logp_difference/mean": 0.01493038795888424, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 212.640625, + "completions/mean_terminated_length": 212.640625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.5538235902786255, + "epoch": 2.917892156862745, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.448151904184562, + "kl": 0.057340092957019806, + "learning_rate": 2.556047327180344e-09, + "loss": -0.0416, + "num_tokens": 75119937.0, + "reward": 0.0625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4229785203933716, + "sampling/importance_sampling_ratio/mean": 1.0000115633010864, + "sampling/importance_sampling_ratio/min": 0.6484609842300415, + "sampling/sampling_logp_difference/max": 0.4331533908843994, + "sampling/sampling_logp_difference/mean": 0.019155602902173996, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 182.578125, + "completions/mean_terminated_length": 182.578125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.48329854011535645, + "epoch": 2.9191176470588234, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8522176133793282, + "kl": 0.0585813894867897, + "learning_rate": 2.484612317299295e-09, + "loss": 0.0062, + "num_tokens": 75149494.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000645399093628, + "sampling/importance_sampling_ratio/min": 0.6106173992156982, + "sampling/sampling_logp_difference/max": 0.8198103904724121, + "sampling/sampling_logp_difference/mean": 0.01875029131770134, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 208.8125, + "completions/mean_terminated_length": 208.8125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3520786762237549, + "epoch": 2.920343137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08546723648294711, + "kl": 0.06635012477636337, + "learning_rate": 2.4141872321367107e-09, + "loss": 0.0006, + "num_tokens": 75178906.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4360222816467285, + "sampling/importance_sampling_ratio/mean": 1.0002968311309814, + "sampling/importance_sampling_ratio/min": 0.70079505443573, + "sampling/sampling_logp_difference/max": 0.3618769645690918, + "sampling/sampling_logp_difference/mean": 0.013945094309747219, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 193.0, + "completions/mean_terminated_length": 193.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3963060975074768, + "epoch": 2.9215686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015982183017377614, + "kl": 0.02367197349667549, + "learning_rate": 2.344772214651014e-09, + "loss": 0.0002, + "num_tokens": 75214250.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6598310470581055, + "sampling/importance_sampling_ratio/mean": 0.9992620944976807, + "sampling/importance_sampling_ratio/min": 0.6274728178977966, + "sampling/sampling_logp_difference/max": 0.5067157745361328, + "sampling/sampling_logp_difference/mean": 0.016099225729703903, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 207.4375, + "completions/mean_terminated_length": 207.4375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.2980409264564514, + "epoch": 2.922794117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021820451302088827, + "kl": 0.020066574215888977, + "learning_rate": 2.2763674057503235e-09, + "loss": 0.0002, + "num_tokens": 75251766.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4847639799118042, + "sampling/importance_sampling_ratio/mean": 1.0004727840423584, + "sampling/importance_sampling_ratio/min": 0.7234383225440979, + "sampling/sampling_logp_difference/max": 0.39525580406188965, + "sampling/sampling_logp_difference/mean": 0.012051548808813095, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 202.546875, + "completions/mean_terminated_length": 202.546875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.3355339765548706, + "epoch": 2.924019607843137, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8187199074871825, + "kl": 0.03872930258512497, + "learning_rate": 2.20897294429212e-09, + "loss": -0.0108, + "num_tokens": 75282153.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5993927717208862, + "sampling/importance_sampling_ratio/mean": 0.9999510049819946, + "sampling/importance_sampling_ratio/min": 0.6995729207992554, + "sampling/sampling_logp_difference/max": 0.46962404251098633, + "sampling/sampling_logp_difference/mean": 0.012686062604188919, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 231.59375, + "completions/mean_terminated_length": 231.59375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.3886439800262451, + "epoch": 2.9252450980392157, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013960937498158493, + "kl": 0.02158696949481964, + "learning_rate": 2.142588967082748e-09, + "loss": 0.0002, + "num_tokens": 75316847.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.774400234222412, + "sampling/importance_sampling_ratio/mean": 1.0002013444900513, + "sampling/importance_sampling_ratio/min": 0.6808310747146606, + "sampling/sampling_logp_difference/max": 0.5734624862670898, + "sampling/sampling_logp_difference/mean": 0.014919614419341087, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 203.4375, + "completions/mean_terminated_length": 203.4375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3687131404876709, + "epoch": 2.9264705882352944, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025827776395270535, + "kl": 0.032721925526857376, + "learning_rate": 2.0772156088776913e-09, + "loss": 0.0003, + "num_tokens": 75346155.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4881680011749268, + "sampling/importance_sampling_ratio/mean": 1.0002375841140747, + "sampling/importance_sampling_ratio/min": 0.6254510879516602, + "sampling/sampling_logp_difference/max": 0.4692821502685547, + "sampling/sampling_logp_difference/mean": 0.014783652499318123, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 203.765625, + "completions/mean_terminated_length": 203.765625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4356057047843933, + "epoch": 2.9276960784313726, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1822634605331237, + "kl": 0.04897632822394371, + "learning_rate": 2.0128530023804656e-09, + "loss": 0.0146, + "num_tokens": 75378860.0, + "reward": 0.75, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4639828205108643, + "sampling/importance_sampling_ratio/mean": 0.9999449849128723, + "sampling/importance_sampling_ratio/min": 0.6783303618431091, + "sampling/sampling_logp_difference/max": 0.3881208896636963, + "sampling/sampling_logp_difference/mean": 0.015538797713816166, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 225.828125, + "completions/mean_terminated_length": 225.828125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.3999805748462677, + "epoch": 2.928921568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017521391739172297, + "kl": 0.024150025099515915, + "learning_rate": 1.9495012782433375e-09, + "loss": 0.0002, + "num_tokens": 75415921.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4013463258743286, + "sampling/importance_sampling_ratio/mean": 0.9993059635162354, + "sampling/importance_sampling_ratio/min": 0.7225843071937561, + "sampling/sampling_logp_difference/max": 0.33743345737457275, + "sampling/sampling_logp_difference/mean": 0.014675119891762733, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 189.90625, + "completions/mean_terminated_length": 189.90625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.32983535528182983, + "epoch": 2.9301470588235294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014103971751140148, + "kl": 0.023350298404693604, + "learning_rate": 1.887160565066048e-09, + "loss": 0.0002, + "num_tokens": 75445355.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6090011596679688, + "sampling/importance_sampling_ratio/mean": 0.9996578693389893, + "sampling/importance_sampling_ratio/min": 0.6511370539665222, + "sampling/sampling_logp_difference/max": 0.4756135940551758, + "sampling/sampling_logp_difference/mean": 0.014014555141329765, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 231.75, + "completions/mean_terminated_length": 231.75, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.2617335915565491, + "epoch": 2.931372549019608, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01586684245297235, + "kl": 0.02428416907787323, + "learning_rate": 1.8258309893965374e-09, + "loss": 0.0003, + "num_tokens": 75482235.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4298864603042603, + "sampling/importance_sampling_ratio/mean": 1.0006804466247559, + "sampling/importance_sampling_ratio/min": 0.7104549407958984, + "sampling/sampling_logp_difference/max": 0.35759496688842773, + "sampling/sampling_logp_difference/mean": 0.010682035237550735, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 170.484375, + "completions/mean_terminated_length": 170.484375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3657550513744354, + "epoch": 2.9325980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02114455024497704, + "kl": 0.02423708140850067, + "learning_rate": 1.7655126757297744e-09, + "loss": 0.0002, + "num_tokens": 75511706.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.403576135635376, + "sampling/importance_sampling_ratio/mean": 0.9998015761375427, + "sampling/importance_sampling_ratio/min": 0.5362949371337891, + "sampling/sampling_logp_difference/max": 0.6230709552764893, + "sampling/sampling_logp_difference/mean": 0.014422647655010223, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 196.265625, + "completions/mean_terminated_length": 196.265625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.3682851493358612, + "epoch": 2.9338235294117645, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020510558490007304, + "kl": 0.037223272025585175, + "learning_rate": 1.7062057465082046e-09, + "loss": 0.0004, + "num_tokens": 75542395.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4761900901794434, + "sampling/importance_sampling_ratio/mean": 0.9994564056396484, + "sampling/importance_sampling_ratio/min": 0.2961808741092682, + "sampling/sampling_logp_difference/max": 1.216784954071045, + "sampling/sampling_logp_difference/mean": 0.01482559833675623, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 193.015625, + "completions/mean_terminated_length": 193.015625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.3677944540977478, + "epoch": 2.935049019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015931602905119236, + "kl": 0.025228401646018028, + "learning_rate": 1.6479103221211377e-09, + "loss": 0.0002, + "num_tokens": 75574124.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4348173141479492, + "sampling/importance_sampling_ratio/mean": 0.9999492764472961, + "sampling/importance_sampling_ratio/min": 0.6410535573959351, + "sampling/sampling_logp_difference/max": 0.4446423053741455, + "sampling/sampling_logp_difference/mean": 0.013891877606511116, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 222.828125, + "completions/mean_terminated_length": 222.828125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.410091757774353, + "epoch": 2.936274509803922, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7605047428407047, + "kl": 0.023822134360671043, + "learning_rate": 1.5906265209045254e-09, + "loss": 0.0033, + "num_tokens": 75606033.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.4722143411636353, + "sampling/importance_sampling_ratio/mean": 0.9998899698257446, + "sampling/importance_sampling_ratio/min": 0.726360559463501, + "sampling/sampling_logp_difference/max": 0.3867676258087158, + "sampling/sampling_logp_difference/mean": 0.014993167482316494, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 204.984375, + "completions/mean_terminated_length": 204.984375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3174899220466614, + "epoch": 2.9375, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8137996587007464, + "kl": 0.022347180172801018, + "learning_rate": 1.534354459140963e-09, + "loss": 0.002, + "num_tokens": 75633344.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.2788511514663696, + "sampling/importance_sampling_ratio/mean": 0.9993703365325928, + "sampling/importance_sampling_ratio/min": 0.6783400774002075, + "sampling/sampling_logp_difference/max": 0.38810646533966064, + "sampling/sampling_logp_difference/mean": 0.012044595554471016, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 194.0625, + "completions/mean_terminated_length": 194.0625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3399444818496704, + "epoch": 2.938725490196078, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8415715402340155, + "kl": 0.03094661235809326, + "learning_rate": 1.4790942510590766e-09, + "loss": -0.0217, + "num_tokens": 75662676.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.626960277557373, + "sampling/importance_sampling_ratio/mean": 1.0001401901245117, + "sampling/importance_sampling_ratio/min": 0.6171442866325378, + "sampling/sampling_logp_difference/max": 0.4867134094238281, + "sampling/sampling_logp_difference/mean": 0.014751039445400238, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 185.984375, + "completions/mean_terminated_length": 185.984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3124786615371704, + "epoch": 2.939950980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9318305373316634, + "kl": 0.027180973440408707, + "learning_rate": 1.4248460088335801e-09, + "loss": -0.0384, + "num_tokens": 75691027.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.3593418598175049, + "sampling/importance_sampling_ratio/mean": 1.0002610683441162, + "sampling/importance_sampling_ratio/min": 0.6063899993896484, + "sampling/sampling_logp_difference/max": 0.5002319812774658, + "sampling/sampling_logp_difference/mean": 0.01331777311861515, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 216.25, + "completions/mean_terminated_length": 216.25, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.3510400056838989, + "epoch": 2.9411764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016127073630881388, + "kl": 0.023105358704924583, + "learning_rate": 1.371609842585053e-09, + "loss": 0.0002, + "num_tokens": 75723395.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5568342208862305, + "sampling/importance_sampling_ratio/mean": 1.0002357959747314, + "sampling/importance_sampling_ratio/min": 0.6176312565803528, + "sampling/sampling_logp_difference/max": 0.48186373710632324, + "sampling/sampling_logp_difference/mean": 0.014547735452651978, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 282.859375, + "completions/mean_terminated_length": 282.859375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.5836253762245178, + "epoch": 2.9424019607843137, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7789305490925427, + "kl": 0.05978118255734444, + "learning_rate": 1.319385860379496e-09, + "loss": -0.0128, + "num_tokens": 75763802.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5122863054275513, + "sampling/importance_sampling_ratio/mean": 0.9999563097953796, + "sampling/importance_sampling_ratio/min": 0.6250794529914856, + "sampling/sampling_logp_difference/max": 0.4698765277862549, + "sampling/sampling_logp_difference/mean": 0.01819756254553795, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 227.0625, + "completions/mean_terminated_length": 227.0625, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3757852017879486, + "epoch": 2.943627450980392, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021447603274242773, + "kl": 0.025004444643855095, + "learning_rate": 1.2681741682282754e-09, + "loss": 0.0002, + "num_tokens": 75792974.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.592671275138855, + "sampling/importance_sampling_ratio/mean": 0.9996393918991089, + "sampling/importance_sampling_ratio/min": 0.6106353402137756, + "sampling/sampling_logp_difference/max": 0.4932553768157959, + "sampling/sampling_logp_difference/mean": 0.014496829360723495, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 241.0, + "completions/mean_terminated_length": 241.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.3505290746688843, + "epoch": 2.9448529411764706, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.758188779896263, + "kl": 0.041052378714084625, + "learning_rate": 1.217974870087901e-09, + "loss": 0.0243, + "num_tokens": 75826094.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5744353532791138, + "sampling/importance_sampling_ratio/mean": 0.9998817443847656, + "sampling/importance_sampling_ratio/min": 0.6181427240371704, + "sampling/sampling_logp_difference/max": 0.4810359477996826, + "sampling/sampling_logp_difference/mean": 0.013801928609609604, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 198.296875, + "completions/mean_terminated_length": 198.296875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.5521295070648193, + "epoch": 2.946078431372549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.212770049450111, + "kl": 0.05674951523542404, + "learning_rate": 1.1687880678596939e-09, + "loss": -0.0075, + "num_tokens": 75864305.0, + "reward": 0.6875, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.461962103843689, + "sampling/importance_sampling_ratio/mean": 1.0005958080291748, + "sampling/importance_sampling_ratio/min": 0.6441908478736877, + "sampling/sampling_logp_difference/max": 0.4397602081298828, + "sampling/sampling_logp_difference/mean": 0.018468894064426422, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 165.1875, + "completions/mean_terminated_length": 165.1875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.3110926151275635, + "epoch": 2.9473039215686274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0208300178904979, + "kl": 0.02462848648428917, + "learning_rate": 1.1206138613898962e-09, + "loss": 0.0002, + "num_tokens": 75890061.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5485175848007202, + "sampling/importance_sampling_ratio/mean": 1.0005801916122437, + "sampling/importance_sampling_ratio/min": 0.7139657139778137, + "sampling/sampling_logp_difference/max": 0.437298059463501, + "sampling/sampling_logp_difference/mean": 0.013580387458205223, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 175.25, + "completions/mean_terminated_length": 175.25, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.31713682413101196, + "epoch": 2.9485294117647056, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019186981424148625, + "kl": 0.025584455579519272, + "learning_rate": 1.0734523484689507e-09, + "loss": 0.0003, + "num_tokens": 75921037.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.600778341293335, + "sampling/importance_sampling_ratio/mean": 1.0001356601715088, + "sampling/importance_sampling_ratio/min": 0.622538149356842, + "sampling/sampling_logp_difference/max": 0.4739503860473633, + "sampling/sampling_logp_difference/mean": 0.01394292339682579, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 203.53125, + "completions/mean_terminated_length": 203.53125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.3787577748298645, + "epoch": 2.9497549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013635619765529598, + "kl": 0.020073357969522476, + "learning_rate": 1.0273036248318324e-09, + "loss": 0.0002, + "num_tokens": 75952031.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3519020080566406, + "sampling/importance_sampling_ratio/mean": 0.9997328519821167, + "sampling/importance_sampling_ratio/min": 0.6203981637954712, + "sampling/sampling_logp_difference/max": 0.47739386558532715, + "sampling/sampling_logp_difference/mean": 0.0135762644931674, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 216.046875, + "completions/mean_terminated_length": 216.046875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.3667415976524353, + "epoch": 2.950980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0279535717611835, + "kl": 0.03812876343727112, + "learning_rate": 9.82167784157495e-10, + "loss": 0.0003, + "num_tokens": 75981202.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.413762092590332, + "sampling/importance_sampling_ratio/mean": 0.9999436140060425, + "sampling/importance_sampling_ratio/min": 0.6165905594825745, + "sampling/sampling_logp_difference/max": 0.4835500717163086, + "sampling/sampling_logp_difference/mean": 0.013796709477901459, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 195.453125, + "completions/mean_terminated_length": 195.453125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.41977283358573914, + "epoch": 2.952205882352941, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8157663726977482, + "kl": 0.14047285914421082, + "learning_rate": 9.380449180688143e-10, + "loss": 0.0211, + "num_tokens": 76010527.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.545351266860962, + "sampling/importance_sampling_ratio/mean": 1.0002124309539795, + "sampling/importance_sampling_ratio/min": 0.7128937244415283, + "sampling/sampling_logp_difference/max": 0.43525123596191406, + "sampling/sampling_logp_difference/mean": 0.015495553612709045, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 222.78125, + "completions/mean_terminated_length": 222.78125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.32274726033210754, + "epoch": 2.9534313725490198, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.012670030737707082, + "kl": 0.015602516010403633, + "learning_rate": 8.949351161324225e-10, + "loss": 0.0002, + "num_tokens": 76046641.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4755103588104248, + "sampling/importance_sampling_ratio/mean": 1.0002577304840088, + "sampling/importance_sampling_ratio/min": 0.6246485710144043, + "sampling/sampling_logp_difference/max": 0.4705660343170166, + "sampling/sampling_logp_difference/mean": 0.0123797208070755, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 157.46875, + "completions/mean_terminated_length": 157.46875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3183969557285309, + "epoch": 2.954656862745098, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9697665139391681, + "kl": 0.02980830706655979, + "learning_rate": 8.528384658584853e-10, + "loss": -0.0261, + "num_tokens": 76071455.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4153097867965698, + "sampling/importance_sampling_ratio/mean": 0.9993491172790527, + "sampling/importance_sampling_ratio/min": 0.6604135632514954, + "sampling/sampling_logp_difference/max": 0.4148890972137451, + "sampling/sampling_logp_difference/mean": 0.013639282435178757, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 212.640625, + "completions/mean_terminated_length": 212.640625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.33786875009536743, + "epoch": 2.9558823529411766, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020922592339934605, + "kl": 0.027387060225009918, + "learning_rate": 8.117550527005912e-10, + "loss": 0.0003, + "num_tokens": 76101128.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.659095048904419, + "sampling/importance_sampling_ratio/mean": 1.0002970695495605, + "sampling/importance_sampling_ratio/min": 0.6153237819671631, + "sampling/sampling_logp_difference/max": 0.5062723159790039, + "sampling/sampling_logp_difference/mean": 0.012486254796385765, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 170.1875, + "completions/mean_terminated_length": 170.1875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.4340662360191345, + "epoch": 2.957107843137255, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03034505904552388, + "kl": 0.05263839662075043, + "learning_rate": 7.716849600554188e-10, + "loss": 0.0005, + "num_tokens": 76129108.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4997516870498657, + "sampling/importance_sampling_ratio/mean": 1.000171184539795, + "sampling/importance_sampling_ratio/min": 0.6809601187705994, + "sampling/sampling_logp_difference/max": 0.4052995443344116, + "sampling/sampling_logp_difference/mean": 0.016778860241174698, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 201.8125, + "completions/mean_terminated_length": 201.8125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.3196262717247009, + "epoch": 2.9583333333333335, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7345859601982961, + "kl": 0.0631108433008194, + "learning_rate": 7.326282692626806e-10, + "loss": 0.0169, + "num_tokens": 76156360.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.7401350736618042, + "sampling/importance_sampling_ratio/mean": 1.0001612901687622, + "sampling/importance_sampling_ratio/min": 0.5697087049484253, + "sampling/sampling_logp_difference/max": 0.5626300573348999, + "sampling/sampling_logp_difference/mean": 0.012970691546797752, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 250.78125, + "completions/mean_terminated_length": 250.78125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.4877535104751587, + "epoch": 2.9595588235294117, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.654849821949973, + "kl": 0.03357920050621033, + "learning_rate": 6.945850596050684e-10, + "loss": 0.0072, + "num_tokens": 76188730.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5156501531600952, + "sampling/importance_sampling_ratio/mean": 0.9997298717498779, + "sampling/importance_sampling_ratio/min": 0.660491943359375, + "sampling/sampling_logp_difference/max": 0.4158444404602051, + "sampling/sampling_logp_difference/mean": 0.016852904111146927, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 180.421875, + "completions/mean_terminated_length": 180.421875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3513135313987732, + "epoch": 2.9607843137254903, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9464599334498442, + "kl": 0.04228626936674118, + "learning_rate": 6.575554083078083e-10, + "loss": 0.0394, + "num_tokens": 76216757.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.612879991531372, + "sampling/importance_sampling_ratio/mean": 1.00016450881958, + "sampling/importance_sampling_ratio/min": 0.5362210273742676, + "sampling/sampling_logp_difference/max": 0.62320876121521, + "sampling/sampling_logp_difference/mean": 0.014640099368989468, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 170.515625, + "completions/mean_terminated_length": 170.515625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.29498258233070374, + "epoch": 2.9620098039215685, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015930040609186152, + "kl": 0.022072361782193184, + "learning_rate": 6.215393905388278e-10, + "loss": 0.0002, + "num_tokens": 76243846.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5760538578033447, + "sampling/importance_sampling_ratio/mean": 1.0002655982971191, + "sampling/importance_sampling_ratio/min": 0.614662230014801, + "sampling/sampling_logp_difference/max": 0.4866824150085449, + "sampling/sampling_logp_difference/mean": 0.013572830706834793, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 209.9375, + "completions/mean_terminated_length": 209.9375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.30850571393966675, + "epoch": 2.963235294117647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01826592571893516, + "kl": 0.024756554514169693, + "learning_rate": 5.865370794082558e-10, + "loss": 0.0002, + "num_tokens": 76275330.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4055031538009644, + "sampling/importance_sampling_ratio/mean": 1.0001401901245117, + "sampling/importance_sampling_ratio/min": 0.6222241520881653, + "sampling/sampling_logp_difference/max": 0.4744548797607422, + "sampling/sampling_logp_difference/mean": 0.01286984235048294, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 196.625, + "completions/mean_terminated_length": 196.625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.3975851535797119, + "epoch": 2.9644607843137254, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020887113552843894, + "kl": 0.028526470065116882, + "learning_rate": 5.525485459687007e-10, + "loss": 0.0003, + "num_tokens": 76305338.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6464240550994873, + "sampling/importance_sampling_ratio/mean": 1.0008063316345215, + "sampling/importance_sampling_ratio/min": 0.6181831359863281, + "sampling/sampling_logp_difference/max": 0.49860572814941406, + "sampling/sampling_logp_difference/mean": 0.016536317765712738, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 239.671875, + "completions/mean_terminated_length": 239.671875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.37244483828544617, + "epoch": 2.965686274509804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013444198409433008, + "kl": 0.018550623208284378, + "learning_rate": 5.195738592145838e-10, + "loss": 0.0002, + "num_tokens": 76350549.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7199466228485107, + "sampling/importance_sampling_ratio/mean": 0.9998754858970642, + "sampling/importance_sampling_ratio/min": 0.678856611251831, + "sampling/sampling_logp_difference/max": 0.5422933101654053, + "sampling/sampling_logp_difference/mean": 0.014810443855822086, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 237.78125, + "completions/mean_terminated_length": 237.78125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.36133065819740295, + "epoch": 2.9669117647058822, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015155280847526294, + "kl": 0.02569812536239624, + "learning_rate": 4.876130860825278e-10, + "loss": 0.0003, + "num_tokens": 76386711.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5359033346176147, + "sampling/importance_sampling_ratio/mean": 1.0001566410064697, + "sampling/importance_sampling_ratio/min": 0.6060562133789062, + "sampling/sampling_logp_difference/max": 0.5007824897766113, + "sampling/sampling_logp_difference/mean": 0.013830517418682575, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 182.875, + "completions/mean_terminated_length": 182.875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3214268684387207, + "epoch": 2.968137254901961, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01717020213442181, + "kl": 0.020274590700864792, + "learning_rate": 4.566662914508579e-10, + "loss": 0.0002, + "num_tokens": 76415615.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3767173290252686, + "sampling/importance_sampling_ratio/mean": 0.9997262954711914, + "sampling/importance_sampling_ratio/min": 0.4835146367549896, + "sampling/sampling_logp_difference/max": 0.7266737222671509, + "sampling/sampling_logp_difference/mean": 0.014127014204859734, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 223.078125, + "completions/mean_terminated_length": 223.078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.35409465432167053, + "epoch": 2.969362745098039, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.825861705013576, + "kl": 0.034467682242393494, + "learning_rate": 4.267335381396564e-10, + "loss": 0.0101, + "num_tokens": 76454516.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.384948492050171, + "sampling/importance_sampling_ratio/mean": 1.0002353191375732, + "sampling/importance_sampling_ratio/min": 0.6298385858535767, + "sampling/sampling_logp_difference/max": 0.4622917175292969, + "sampling/sampling_logp_difference/mean": 0.013544151559472084, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 181.875, + "completions/mean_terminated_length": 181.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.3696131110191345, + "epoch": 2.9705882352941178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2415214478536805, + "kl": 0.06753590703010559, + "learning_rate": 3.978148869103748e-10, + "loss": 0.0131, + "num_tokens": 76484764.0, + "reward": 0.3125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.5278277397155762, + "sampling/importance_sampling_ratio/mean": 0.9996430277824402, + "sampling/importance_sampling_ratio/min": 0.6368826031684875, + "sampling/sampling_logp_difference/max": 0.4511699676513672, + "sampling/sampling_logp_difference/mean": 0.014389928430318832, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 159.84375, + "completions/mean_terminated_length": 159.84375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.35769280791282654, + "epoch": 2.971813725490196, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01811837475727411, + "kl": 0.02568848803639412, + "learning_rate": 3.699103964661665e-10, + "loss": 0.0003, + "num_tokens": 76526946.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.564016580581665, + "sampling/importance_sampling_ratio/mean": 0.9992374181747437, + "sampling/importance_sampling_ratio/min": 0.6108652353286743, + "sampling/sampling_logp_difference/max": 0.49287891387939453, + "sampling/sampling_logp_difference/mean": 0.014937417581677437, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 160.6875, + "completions/mean_terminated_length": 160.6875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.36411798000335693, + "epoch": 2.9730392156862746, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9997063482295577, + "kl": 0.059302765876054764, + "learning_rate": 3.430201234513874e-10, + "loss": 0.0068, + "num_tokens": 76550814.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4970269203186035, + "sampling/importance_sampling_ratio/mean": 1.0000662803649902, + "sampling/importance_sampling_ratio/min": 0.5362949371337891, + "sampling/sampling_logp_difference/max": 0.6230709552764893, + "sampling/sampling_logp_difference/mean": 0.015909433364868164, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 200.65625, + "completions/mean_terminated_length": 200.65625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.39167535305023193, + "epoch": 2.974264705882353, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9152092662765233, + "kl": 0.03770778328180313, + "learning_rate": 3.171441224514848e-10, + "loss": -0.0107, + "num_tokens": 76582056.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.416238784790039, + "sampling/importance_sampling_ratio/mean": 1.0002343654632568, + "sampling/importance_sampling_ratio/min": 0.6949491500854492, + "sampling/sampling_logp_difference/max": 0.36391663551330566, + "sampling/sampling_logp_difference/mean": 0.015838623046875, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 210.21875, + "completions/mean_terminated_length": 210.21875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.3457028567790985, + "epoch": 2.9754901960784315, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014345896259431879, + "kl": 0.019017256796360016, + "learning_rate": 2.922824459931639e-10, + "loss": 0.0002, + "num_tokens": 76616086.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4058324098587036, + "sampling/importance_sampling_ratio/mean": 1.0000433921813965, + "sampling/importance_sampling_ratio/min": 0.5289757251739502, + "sampling/sampling_logp_difference/max": 0.6368128061294556, + "sampling/sampling_logp_difference/mean": 0.013697953894734383, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 216.125, + "completions/mean_terminated_length": 216.125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.4409083127975464, + "epoch": 2.9767156862745097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024087873197584663, + "kl": 0.0422600694000721, + "learning_rate": 2.684351445440547e-10, + "loss": 0.0004, + "num_tokens": 76652878.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5682049989700317, + "sampling/importance_sampling_ratio/mean": 1.0005720853805542, + "sampling/importance_sampling_ratio/min": 0.6955130696296692, + "sampling/sampling_logp_difference/max": 0.44993162155151367, + "sampling/sampling_logp_difference/mean": 0.015224607661366463, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 187.703125, + "completions/mean_terminated_length": 187.703125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.47271329164505005, + "epoch": 2.9779411764705883, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9832786884744623, + "kl": 0.06448041647672653, + "learning_rate": 2.456022665127122e-10, + "loss": 0.0155, + "num_tokens": 76689019.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.7292759418487549, + "sampling/importance_sampling_ratio/mean": 1.0006279945373535, + "sampling/importance_sampling_ratio/min": 0.7186931371688843, + "sampling/sampling_logp_difference/max": 0.5477027893066406, + "sampling/sampling_logp_difference/mean": 0.017432495951652527, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 188.703125, + "completions/mean_terminated_length": 188.703125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.3937327563762665, + "epoch": 2.9791666666666665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017894899872886165, + "kl": 0.029064467176795006, + "learning_rate": 2.2378385824833866e-10, + "loss": 0.0003, + "num_tokens": 76721544.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.588956594467163, + "sampling/importance_sampling_ratio/mean": 1.0005124807357788, + "sampling/importance_sampling_ratio/min": 0.6262632608413696, + "sampling/sampling_logp_difference/max": 0.4679844379425049, + "sampling/sampling_logp_difference/mean": 0.016251537948846817, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 211.125, + "completions/mean_terminated_length": 211.125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.43084716796875, + "epoch": 2.980392156862745, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7980041187157573, + "kl": 0.034850817173719406, + "learning_rate": 2.0297996404095018e-10, + "loss": 0.0219, + "num_tokens": 76752128.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.5484894514083862, + "sampling/importance_sampling_ratio/mean": 1.0004076957702637, + "sampling/importance_sampling_ratio/min": 0.6262337565422058, + "sampling/sampling_logp_difference/max": 0.468031644821167, + "sampling/sampling_logp_difference/mean": 0.016249552369117737, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 209.078125, + "completions/mean_terminated_length": 209.078125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.37195223569869995, + "epoch": 2.9816176470588234, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018372951542397854, + "kl": 0.022795135155320168, + "learning_rate": 1.8319062612115467e-10, + "loss": 0.0002, + "num_tokens": 76783445.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4479608535766602, + "sampling/importance_sampling_ratio/mean": 0.9999375343322754, + "sampling/importance_sampling_ratio/min": 0.6661897301673889, + "sampling/sampling_logp_difference/max": 0.406180739402771, + "sampling/sampling_logp_difference/mean": 0.01377858780324459, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 195.15625, + "completions/mean_terminated_length": 195.15625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3060455918312073, + "epoch": 2.982843137254902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013220137123882814, + "kl": 0.020647821947932243, + "learning_rate": 1.6441588466009627e-10, + "loss": 0.0002, + "num_tokens": 76812511.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.862382411956787, + "sampling/importance_sampling_ratio/mean": 0.9997444152832031, + "sampling/importance_sampling_ratio/min": 0.6732436418533325, + "sampling/sampling_logp_difference/max": 0.6218565702438354, + "sampling/sampling_logp_difference/mean": 0.013443908654153347, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 192.8125, + "completions/mean_terminated_length": 192.8125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.38746321201324463, + "epoch": 2.9840686274509802, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040917264826652325, + "kl": 0.06290300190448761, + "learning_rate": 1.4665577776923343e-10, + "loss": 0.0007, + "num_tokens": 76844179.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6007455587387085, + "sampling/importance_sampling_ratio/mean": 0.9994732737541199, + "sampling/importance_sampling_ratio/min": 0.6137450933456421, + "sampling/sampling_logp_difference/max": 0.488175630569458, + "sampling/sampling_logp_difference/mean": 0.015084675513207912, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 180.890625, + "completions/mean_terminated_length": 180.890625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.39325374364852905, + "epoch": 2.985294117647059, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026781616326749733, + "kl": 0.04768332093954086, + "learning_rate": 1.2991034150050538e-10, + "loss": 0.0005, + "num_tokens": 76871644.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6221674680709839, + "sampling/importance_sampling_ratio/mean": 1.000661849975586, + "sampling/importance_sampling_ratio/min": 0.6309764981269836, + "sampling/sampling_logp_difference/max": 0.4837632179260254, + "sampling/sampling_logp_difference/mean": 0.014678483828902245, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 230.140625, + "completions/mean_terminated_length": 230.140625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3076133131980896, + "epoch": 2.986519607843137, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022439207037943637, + "kl": 0.027489451691508293, + "learning_rate": 1.1417960984605457e-10, + "loss": 0.0003, + "num_tokens": 76902037.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6770782470703125, + "sampling/importance_sampling_ratio/mean": 1.0006099939346313, + "sampling/importance_sampling_ratio/min": 0.5068971514701843, + "sampling/sampling_logp_difference/max": 0.6794471740722656, + "sampling/sampling_logp_difference/mean": 0.013162676244974136, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 240.21875, + "completions/mean_terminated_length": 240.21875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.33798423409461975, + "epoch": 2.9877450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6925080574839301, + "kl": 0.02133062668144703, + "learning_rate": 9.946361473822662e-11, + "loss": 0.015, + "num_tokens": 76936771.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4189066886901855, + "sampling/importance_sampling_ratio/mean": 1.0010058879852295, + "sampling/importance_sampling_ratio/min": 0.6178327202796936, + "sampling/sampling_logp_difference/max": 0.4815375804901123, + "sampling/sampling_logp_difference/mean": 0.01256520114839077, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 260.65625, + "completions/mean_terminated_length": 260.65625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.3749733567237854, + "epoch": 2.9889705882352944, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7822023535083346, + "kl": 0.031120579689741135, + "learning_rate": 8.576238604968144e-11, + "loss": -0.0014, + "num_tokens": 76973629.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4561256170272827, + "sampling/importance_sampling_ratio/mean": 0.9998626112937927, + "sampling/importance_sampling_ratio/min": 0.5609079599380493, + "sampling/sampling_logp_difference/max": 0.5781984329223633, + "sampling/sampling_logp_difference/mean": 0.013883860781788826, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 182.15625, + "completions/mean_terminated_length": 182.15625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.36464837193489075, + "epoch": 2.9901960784313726, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7809426881687429, + "kl": 0.040025513619184494, + "learning_rate": 7.307595159300461e-11, + "loss": -0.0036, + "num_tokens": 77005191.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4753769636154175, + "sampling/importance_sampling_ratio/mean": 0.9999262094497681, + "sampling/importance_sampling_ratio/min": 0.5693280696868896, + "sampling/sampling_logp_difference/max": 0.5632984638214111, + "sampling/sampling_logp_difference/mean": 0.014043103903532028, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 207.9375, + "completions/mean_terminated_length": 207.9375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.4231107831001282, + "epoch": 2.991421568627451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022112389211604168, + "kl": 0.029684830456972122, + "learning_rate": 6.140433712076287e-11, + "loss": 0.0003, + "num_tokens": 77039411.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5629370212554932, + "sampling/importance_sampling_ratio/mean": 1.0006803274154663, + "sampling/importance_sampling_ratio/min": 0.6103478074073792, + "sampling/sampling_logp_difference/max": 0.4937262535095215, + "sampling/sampling_logp_difference/mean": 0.01552946213632822, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 237.328125, + "completions/mean_terminated_length": 237.328125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.3809570372104645, + "epoch": 2.9926470588235294, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9849094616363699, + "kl": 0.06580809503793716, + "learning_rate": 5.074756632572619e-11, + "loss": -0.0084, + "num_tokens": 77071656.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4351915121078491, + "sampling/importance_sampling_ratio/mean": 0.999637246131897, + "sampling/importance_sampling_ratio/min": 0.5690277218818665, + "sampling/sampling_logp_difference/max": 0.5638261437416077, + "sampling/sampling_logp_difference/mean": 0.013923577964305878, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 209.34375, + "completions/mean_terminated_length": 209.34375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.40023350715637207, + "epoch": 2.993872549019608, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9166245589220879, + "kl": 0.02449093759059906, + "learning_rate": 4.110566084036815e-11, + "loss": -0.0108, + "num_tokens": 77101838.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.649056315422058, + "sampling/importance_sampling_ratio/mean": 0.9999035000801086, + "sampling/importance_sampling_ratio/min": 0.6530977487564087, + "sampling/sampling_logp_difference/max": 0.5002031326293945, + "sampling/sampling_logp_difference/mean": 0.015382511541247368, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 207.734375, + "completions/mean_terminated_length": 207.734375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.44583308696746826, + "epoch": 2.9950980392156863, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03138498208642566, + "kl": 0.0639105960726738, + "learning_rate": 3.247864023719904e-11, + "loss": 0.0006, + "num_tokens": 77130973.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5488362312316895, + "sampling/importance_sampling_ratio/mean": 0.9999371767044067, + "sampling/importance_sampling_ratio/min": 0.6546967029571533, + "sampling/sampling_logp_difference/max": 0.4375038146972656, + "sampling/sampling_logp_difference/mean": 0.016407610848546028, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 158.953125, + "completions/mean_terminated_length": 158.953125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.3797130882740021, + "epoch": 2.9963235294117645, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9006243160778002, + "kl": 0.06772524863481522, + "learning_rate": 2.4866522028488268e-11, + "loss": 0.0037, + "num_tokens": 77157978.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.424027919769287, + "sampling/importance_sampling_ratio/mean": 0.999600887298584, + "sampling/importance_sampling_ratio/min": 0.7135008573532104, + "sampling/sampling_logp_difference/max": 0.35348939895629883, + "sampling/sampling_logp_difference/mean": 0.015492400154471397, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 179.671875, + "completions/mean_terminated_length": 179.671875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.44988203048706055, + "epoch": 2.997549019607843, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04407334586845134, + "kl": 0.05289856344461441, + "learning_rate": 1.8269321666375403e-11, + "loss": 0.0004, + "num_tokens": 77186725.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4753925800323486, + "sampling/importance_sampling_ratio/mean": 0.9999063611030579, + "sampling/importance_sampling_ratio/min": 0.7180938720703125, + "sampling/sampling_logp_difference/max": 0.38892412185668945, + "sampling/sampling_logp_difference/mean": 0.017196040600538254, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 216.796875, + "completions/mean_terminated_length": 216.796875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.3979523181915283, + "epoch": 2.998774509803922, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021870598023562535, + "kl": 0.03111506626009941, + "learning_rate": 1.2687052542759147e-11, + "loss": 0.0003, + "num_tokens": 77219448.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5071574449539185, + "sampling/importance_sampling_ratio/mean": 0.9996896982192993, + "sampling/importance_sampling_ratio/min": 0.574481725692749, + "sampling/sampling_logp_difference/max": 0.5542869567871094, + "sampling/sampling_logp_difference/mean": 0.015271512791514397, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 195.546875, + "completions/mean_terminated_length": 195.546875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.35060396790504456, + "epoch": 3.0, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02021418566622152, + "kl": 0.025219064205884933, + "learning_rate": 8.119725989241822e-12, + "loss": 0.0002, + "num_tokens": 77246587.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6122491359710693, + "sampling/importance_sampling_ratio/mean": 1.0004022121429443, + "sampling/importance_sampling_ratio/min": 0.6406969428062439, + "sampling/sampling_logp_difference/max": 0.4776301383972168, + "sampling/sampling_logp_difference/mean": 0.014867585152387619, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 207.796875, + "completions/mean_terminated_length": 207.796875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.3712385594844818, + "epoch": 3.0012254901960786, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3733932754087725, + "kl": 0.030660737305879593, + "learning_rate": 4.5673512772959055e-12, + "loss": 0.0457, + "num_tokens": 77281118.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998342990875244, + "sampling/importance_sampling_ratio/min": 0.6906341314315796, + "sampling/sampling_logp_difference/max": 0.7512912750244141, + "sampling/sampling_logp_difference/mean": 0.014142373576760292, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 145.359375, + "completions/mean_terminated_length": 145.359375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.4449942708015442, + "epoch": 3.002450980392157, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.020521614562299, + "kl": 0.061585891991853714, + "learning_rate": 2.0299356179309666e-12, + "loss": 0.0156, + "num_tokens": 77314037.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 0.9996541142463684, + "sampling/importance_sampling_ratio/min": 0.6051085591316223, + "sampling/sampling_logp_difference/max": 0.502347469329834, + "sampling/sampling_logp_difference/mean": 0.01772310584783554, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 205.875, + "completions/mean_terminated_length": 205.875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.33885860443115234, + "epoch": 3.0036764705882355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018110718259315723, + "kl": 0.02658119425177574, + "learning_rate": 5.074841620267278e-13, + "loss": 0.0003, + "num_tokens": 77342493.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0003312826156616, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.9316283464431763, + "sampling/sampling_logp_difference/mean": 0.013981176540255547, + "step": 2451 + } + ], + "logging_steps": 1, + "max_steps": 2451, + "num_input_tokens_seen": 77342493, + "num_train_epochs": 4, + "save_steps": 817, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}