{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0036764705882355, "eval_steps": 500, "global_step": 2451, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.28229743242263794, "epoch": 0.0012254901960784314, "frac_reward_zero_std": 0.75, "grad_norm": 0.7479166325555309, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0063, "num_tokens": 29328.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5957260131835938, "sampling/importance_sampling_ratio/mean": 1.0002766847610474, "sampling/importance_sampling_ratio/min": 0.6407270431518555, "sampling/sampling_logp_difference/max": 0.4673287868499756, "sampling/sampling_logp_difference/mean": 0.013949751853942871, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3556089997291565, "epoch": 0.0024509803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.065040650406504e-09, "loss": 0.0, "num_tokens": 56536.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6171398162841797, "sampling/importance_sampling_ratio/mean": 1.0003067255020142, "sampling/importance_sampling_ratio/min": 0.6687158942222595, "sampling/sampling_logp_difference/max": 0.48065900802612305, "sampling/sampling_logp_difference/mean": 0.016711918637156487, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 199.578125, "completions/mean_terminated_length": 199.578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4099624752998352, "epoch": 0.003676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.002462603523173163, "kl": 0.0005740458145737648, "learning_rate": 8.130081300813008e-09, "loss": 0.0, "num_tokens": 87501.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.384435772895813, "sampling/importance_sampling_ratio/mean": 1.000922679901123, "sampling/importance_sampling_ratio/min": 0.6132686734199524, "sampling/sampling_logp_difference/max": 0.4889521598815918, "sampling/sampling_logp_difference/mean": 0.017119944095611572, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.38932323455810547, "epoch": 0.004901960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 0.8683691777653537, "kl": 0.00048791104927659035, "learning_rate": 1.2195121951219512e-08, "loss": -0.0066, "num_tokens": 121221.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8221690654754639, "sampling/importance_sampling_ratio/mean": 0.999820351600647, "sampling/importance_sampling_ratio/min": 0.6315130591392517, "sampling/sampling_logp_difference/max": 0.6000275611877441, "sampling/sampling_logp_difference/mean": 0.015830399468541145, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 242.5625, "completions/mean_terminated_length": 242.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3684294819831848, "epoch": 0.006127450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.0458740847410508, "kl": 0.0005367250414565206, "learning_rate": 1.6260162601626016e-08, "loss": 0.0137, "num_tokens": 161993.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6047180891036987, "sampling/importance_sampling_ratio/mean": 1.0001524686813354, "sampling/importance_sampling_ratio/min": 0.6081744432449341, "sampling/sampling_logp_difference/max": 0.49729347229003906, "sampling/sampling_logp_difference/mean": 0.015667764469981194, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 174.859375, "completions/mean_terminated_length": 174.859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.355221152305603, "epoch": 0.007352941176470588, "frac_reward_zero_std": 0.75, "grad_norm": 0.9879967709804532, "kl": 0.0006942846230231225, "learning_rate": 2.032520325203252e-08, "loss": -0.006, "num_tokens": 188608.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6463251113891602, "sampling/importance_sampling_ratio/mean": 1.000199317932129, "sampling/importance_sampling_ratio/min": 0.5885282158851624, "sampling/sampling_logp_difference/max": 0.5301303863525391, "sampling/sampling_logp_difference/mean": 0.016422923654317856, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 210.875, "completions/mean_terminated_length": 210.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3294309675693512, "epoch": 0.00857843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.38987441779447, "kl": 0.0005754978628829122, "learning_rate": 2.4390243902439023e-08, "loss": 0.0212, "num_tokens": 220840.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.618558406829834, "sampling/importance_sampling_ratio/mean": 0.9997539520263672, "sampling/importance_sampling_ratio/min": 0.638060986995697, "sampling/sampling_logp_difference/max": 0.4815359115600586, "sampling/sampling_logp_difference/mean": 0.014915119856595993, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 173.65625, "completions/mean_terminated_length": 173.65625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.33604592084884644, "epoch": 0.00980392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.002185939782348619, "kl": 0.0005290215485729277, "learning_rate": 2.8455284552845527e-08, "loss": 0.0, "num_tokens": 249602.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2978038787841797, "sampling/importance_sampling_ratio/mean": 1.0001516342163086, "sampling/importance_sampling_ratio/min": 0.6171172261238098, "sampling/sampling_logp_difference/max": 0.4826962947845459, "sampling/sampling_logp_difference/mean": 0.014389926567673683, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 185.3125, "completions/mean_terminated_length": 185.3125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.376762330532074, "epoch": 0.011029411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.2913004060570497, "kl": 0.0006159727345220745, "learning_rate": 3.252032520325203e-08, "loss": 0.0153, "num_tokens": 283590.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3472496271133423, "sampling/importance_sampling_ratio/mean": 0.9995624423027039, "sampling/importance_sampling_ratio/min": 0.6078652739524841, "sampling/sampling_logp_difference/max": 0.4978019595146179, "sampling/sampling_logp_difference/mean": 0.016848571598529816, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 196.53125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.36589953303337097, "epoch": 0.012254901960784314, "frac_reward_zero_std": 0.75, "grad_norm": 0.9788700484841335, "kl": 0.0006352112395688891, "learning_rate": 3.658536585365853e-08, "loss": -0.0314, "num_tokens": 316776.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4398318529129028, "sampling/importance_sampling_ratio/mean": 0.9998399019241333, "sampling/importance_sampling_ratio/min": 0.6124805212020874, "sampling/sampling_logp_difference/max": 0.4902381896972656, "sampling/sampling_logp_difference/mean": 0.016363628208637238, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 188.140625, "completions/mean_terminated_length": 188.140625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3949275016784668, "epoch": 0.013480392156862746, "frac_reward_zero_std": 0.25, "grad_norm": 1.742921810360062, "kl": 0.0007006666273809969, "learning_rate": 4.065040650406504e-08, "loss": 0.0184, "num_tokens": 345569.0, "reward": 0.3125, "reward_std": 0.6116957664489746, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6221922636032104, "sampling/importance_sampling_ratio/mean": 1.0000474452972412, "sampling/importance_sampling_ratio/min": 0.6204392910003662, "sampling/sampling_logp_difference/max": 0.4837784767150879, "sampling/sampling_logp_difference/mean": 0.017850767821073532, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 208.46875, "completions/mean_terminated_length": 208.46875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.31998831033706665, "epoch": 0.014705882352941176, "frac_reward_zero_std": 0.5, "grad_norm": 1.1855691193334834, "kl": 0.0006962069310247898, "learning_rate": 4.4715447154471546e-08, "loss": -0.0278, "num_tokens": 375807.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5744433403015137, "sampling/importance_sampling_ratio/mean": 0.9997034072875977, "sampling/importance_sampling_ratio/min": 0.6303018927574158, "sampling/sampling_logp_difference/max": 0.46155643463134766, "sampling/sampling_logp_difference/mean": 0.014594350941479206, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 172.109375, "completions/mean_terminated_length": 172.109375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3764714300632477, "epoch": 0.015931372549019607, "frac_reward_zero_std": 0.75, "grad_norm": 1.0914280328623476, "kl": 0.0006829964695498347, "learning_rate": 4.878048780487805e-08, "loss": 0.0069, "num_tokens": 401190.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.623640537261963, "sampling/importance_sampling_ratio/mean": 1.0012812614440918, "sampling/importance_sampling_ratio/min": 0.6074984669685364, "sampling/sampling_logp_difference/max": 0.49840569496154785, "sampling/sampling_logp_difference/mean": 0.017124183475971222, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 174.265625, "completions/mean_terminated_length": 174.265625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.31244346499443054, "epoch": 0.01715686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.0529020435196639, "kl": 0.0006743941339664161, "learning_rate": 5.2845528455284554e-08, "loss": -0.0149, "num_tokens": 427351.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5630708932876587, "sampling/importance_sampling_ratio/mean": 1.0000364780426025, "sampling/importance_sampling_ratio/min": 0.6112802028656006, "sampling/sampling_logp_difference/max": 0.4921998977661133, "sampling/sampling_logp_difference/mean": 0.01569565385580063, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 186.65625, "completions/mean_terminated_length": 186.65625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2303016185760498, "epoch": 0.01838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.002993495547442926, "kl": 0.0005023834528401494, "learning_rate": 5.6910569105691055e-08, "loss": 0.0, "num_tokens": 456849.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.596113681793213, "sampling/importance_sampling_ratio/mean": 0.9996975660324097, "sampling/importance_sampling_ratio/min": 0.48819631338119507, "sampling/sampling_logp_difference/max": 0.7170376777648926, "sampling/sampling_logp_difference/mean": 0.011747606098651886, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 219.90625, "completions/mean_terminated_length": 219.90625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.42675280570983887, "epoch": 0.0196078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9331486011235821, "kl": 0.0005129431374371052, "learning_rate": 6.097560975609756e-08, "loss": 0.0012, "num_tokens": 496571.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6089171171188354, "sampling/importance_sampling_ratio/mean": 1.0008577108383179, "sampling/importance_sampling_ratio/min": 0.6093137860298157, "sampling/sampling_logp_difference/max": 0.4954218864440918, "sampling/sampling_logp_difference/mean": 0.017686408013105392, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 181.96875, "completions/mean_terminated_length": 181.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3967221975326538, "epoch": 0.020833333333333332, "frac_reward_zero_std": 0.5, "grad_norm": 1.401103340712179, "kl": 0.0006027425406500697, "learning_rate": 6.504065040650406e-08, "loss": 0.0146, "num_tokens": 524617.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6117963790893555, "sampling/importance_sampling_ratio/mean": 1.0005756616592407, "sampling/importance_sampling_ratio/min": 0.6582546830177307, "sampling/sampling_logp_difference/max": 0.47734928131103516, "sampling/sampling_logp_difference/mean": 0.016756337136030197, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 185.65625, "completions/mean_terminated_length": 185.65625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2854838967323303, "epoch": 0.022058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027790260821808736, "kl": 0.000610773335210979, "learning_rate": 6.910569105691057e-08, "loss": 0.0, "num_tokens": 551203.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5293587446212769, "sampling/importance_sampling_ratio/mean": 0.9998745322227478, "sampling/importance_sampling_ratio/min": 0.6232456564903259, "sampling/sampling_logp_difference/max": 0.47281455993652344, "sampling/sampling_logp_difference/mean": 0.014073856174945831, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 225.515625, "completions/mean_terminated_length": 225.515625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4128747284412384, "epoch": 0.023284313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.1387017471697445, "kl": 0.0005442682886496186, "learning_rate": 7.317073170731706e-08, "loss": 0.0208, "num_tokens": 586180.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6147140264511108, "sampling/importance_sampling_ratio/mean": 0.9997754693031311, "sampling/importance_sampling_ratio/min": 0.4225813150405884, "sampling/sampling_logp_difference/max": 0.8613734245300293, "sampling/sampling_logp_difference/mean": 0.016178175806999207, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.34866586327552795, "epoch": 0.024509803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.127891737670114, "kl": 0.0005755886086262763, "learning_rate": 7.723577235772358e-08, "loss": 0.0189, "num_tokens": 617060.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.489638328552246, "sampling/importance_sampling_ratio/mean": 0.9997918009757996, "sampling/importance_sampling_ratio/min": 0.6267635226249695, "sampling/sampling_logp_difference/max": 0.46718597412109375, "sampling/sampling_logp_difference/mean": 0.015172285959124565, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3468027412891388, "epoch": 0.025735294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.9147208366057095, "kl": 0.0005272025009617209, "learning_rate": 8.130081300813008e-08, "loss": -0.0192, "num_tokens": 645860.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.622986912727356, "sampling/importance_sampling_ratio/mean": 0.9999600052833557, "sampling/importance_sampling_ratio/min": 0.6193138360977173, "sampling/sampling_logp_difference/max": 0.4842681884765625, "sampling/sampling_logp_difference/mean": 0.01507607288658619, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 180.984375, "completions/mean_terminated_length": 180.984375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.29409506916999817, "epoch": 0.02696078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.071189239381354, "kl": 0.0004567463183775544, "learning_rate": 8.536585365853659e-08, "loss": -0.0218, "num_tokens": 677347.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5818450450897217, "sampling/importance_sampling_ratio/mean": 0.9996393918991089, "sampling/importance_sampling_ratio/min": 0.4838048219680786, "sampling/sampling_logp_difference/max": 0.7260737419128418, "sampling/sampling_logp_difference/mean": 0.013134635984897614, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 227.15625, "completions/mean_terminated_length": 227.15625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4192052483558655, "epoch": 0.028186274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.8350320483904694, "kl": 0.000575724639929831, "learning_rate": 8.943089430894309e-08, "loss": -0.0138, "num_tokens": 711885.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4994769096374512, "sampling/importance_sampling_ratio/mean": 0.9990925192832947, "sampling/importance_sampling_ratio/min": 0.37036260962486267, "sampling/sampling_logp_difference/max": 0.9932727813720703, "sampling/sampling_logp_difference/mean": 0.017731059342622757, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 182.71875, "completions/mean_terminated_length": 182.71875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3272732198238373, "epoch": 0.029411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.1614412937586946, "kl": 0.0005397280328907073, "learning_rate": 9.349593495934959e-08, "loss": 0.0425, "num_tokens": 739595.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5641429424285889, "sampling/importance_sampling_ratio/mean": 0.9999706745147705, "sampling/importance_sampling_ratio/min": 0.6439087986946106, "sampling/sampling_logp_difference/max": 0.4473381042480469, "sampling/sampling_logp_difference/mean": 0.01354675181210041, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.46684208512306213, "epoch": 0.030637254901960783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031829255161353784, "kl": 0.0006086308276280761, "learning_rate": 9.75609756097561e-08, "loss": 0.0, "num_tokens": 774955.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6248189210891724, "sampling/importance_sampling_ratio/mean": 0.9997011423110962, "sampling/importance_sampling_ratio/min": 0.6368674039840698, "sampling/sampling_logp_difference/max": 0.4853963851928711, "sampling/sampling_logp_difference/mean": 0.018731631338596344, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 177.09375, "completions/mean_terminated_length": 177.09375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.26852524280548096, "epoch": 0.031862745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.003296172078637039, "kl": 0.0005129881319589913, "learning_rate": 1.016260162601626e-07, "loss": 0.0, "num_tokens": 802769.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277632474899292, "sampling/importance_sampling_ratio/mean": 1.000476598739624, "sampling/importance_sampling_ratio/min": 0.6301831007003784, "sampling/sampling_logp_difference/max": 0.4617447853088379, "sampling/sampling_logp_difference/mean": 0.012203315272927284, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.28598493337631226, "epoch": 0.03308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017730737089246148, "kl": 0.00047270144568756223, "learning_rate": 1.0569105691056911e-07, "loss": 0.0, "num_tokens": 838137.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6515837907791138, "sampling/importance_sampling_ratio/mean": 1.000171422958374, "sampling/importance_sampling_ratio/min": 0.6097134351730347, "sampling/sampling_logp_difference/max": 0.501734733581543, "sampling/sampling_logp_difference/mean": 0.013479228131473064, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 202.046875, "completions/mean_terminated_length": 202.046875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4291865825653076, "epoch": 0.03431372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.7712373761585439, "kl": 0.0006866075564175844, "learning_rate": 1.097560975609756e-07, "loss": 0.0068, "num_tokens": 867660.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.7520534992218018, "sampling/importance_sampling_ratio/mean": 0.999358594417572, "sampling/importance_sampling_ratio/min": 0.6255505681037903, "sampling/sampling_logp_difference/max": 0.5607885122299194, "sampling/sampling_logp_difference/mean": 0.018599187955260277, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 194.671875, "completions/mean_terminated_length": 194.671875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3803984522819519, "epoch": 0.03553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.3284860712625153, "kl": 0.0005068336031399667, "learning_rate": 1.1382113821138211e-07, "loss": -0.0224, "num_tokens": 898327.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.7105915546417236, "sampling/importance_sampling_ratio/mean": 0.9996547698974609, "sampling/importance_sampling_ratio/min": 0.6176413893699646, "sampling/sampling_logp_difference/max": 0.5368392467498779, "sampling/sampling_logp_difference/mean": 0.015175139531493187, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 170.203125, "completions/mean_terminated_length": 170.203125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3811706006526947, "epoch": 0.03676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.1298075740240563, "kl": 0.0007677193498238921, "learning_rate": 1.1788617886178862e-07, "loss": 0.001, "num_tokens": 922660.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6084727048873901, "sampling/importance_sampling_ratio/mean": 1.0000172853469849, "sampling/importance_sampling_ratio/min": 0.6318622827529907, "sampling/sampling_logp_difference/max": 0.47528505325317383, "sampling/sampling_logp_difference/mean": 0.01936884969472885, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 196.421875, "completions/mean_terminated_length": 196.421875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3232734501361847, "epoch": 0.03799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.002693476953860462, "kl": 0.000636767887044698, "learning_rate": 1.219512195121951e-07, "loss": 0.0, "num_tokens": 950223.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6622308492660522, "sampling/importance_sampling_ratio/mean": 1.000096321105957, "sampling/importance_sampling_ratio/min": 0.6301569938659668, "sampling/sampling_logp_difference/max": 0.5081605911254883, "sampling/sampling_logp_difference/mean": 0.015153428539633751, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 182.0625, "completions/mean_terminated_length": 182.0625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3069888949394226, "epoch": 0.0392156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030196274756335035, "kl": 0.0006638698978349566, "learning_rate": 1.260162601626016e-07, "loss": 0.0, "num_tokens": 986755.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8526816368103027, "sampling/importance_sampling_ratio/mean": 1.000173807144165, "sampling/importance_sampling_ratio/min": 0.6228201985359192, "sampling/sampling_logp_difference/max": 0.6166341304779053, "sampling/sampling_logp_difference/mean": 0.014696375466883183, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 222.28125, "completions/mean_terminated_length": 222.28125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3425103425979614, "epoch": 0.04044117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 0.7401089766711731, "kl": 0.0005140869179740548, "learning_rate": 1.3008130081300813e-07, "loss": 0.0137, "num_tokens": 1019797.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6463189125061035, "sampling/importance_sampling_ratio/mean": 0.9994274377822876, "sampling/importance_sampling_ratio/min": 0.6264773011207581, "sampling/sampling_logp_difference/max": 0.49854183197021484, "sampling/sampling_logp_difference/mean": 0.014486259780824184, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 220.4375, "completions/mean_terminated_length": 220.4375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.35915452241897583, "epoch": 0.041666666666666664, "frac_reward_zero_std": 0.5, "grad_norm": 1.4135222586157743, "kl": 0.0005540281417779624, "learning_rate": 1.3414634146341465e-07, "loss": -0.0628, "num_tokens": 1053873.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.629637360572815, "sampling/importance_sampling_ratio/mean": 1.0001004934310913, "sampling/importance_sampling_ratio/min": 0.5777447819709778, "sampling/sampling_logp_difference/max": 0.5486230850219727, "sampling/sampling_logp_difference/mean": 0.014785278588533401, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 195.3125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3688523769378662, "epoch": 0.0428921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9374397758001798, "kl": 0.000617923797108233, "learning_rate": 1.3821138211382114e-07, "loss": 0.0148, "num_tokens": 1086149.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3920141458511353, "sampling/importance_sampling_ratio/mean": 0.9999972581863403, "sampling/importance_sampling_ratio/min": 0.622948169708252, "sampling/sampling_logp_difference/max": 0.47329187393188477, "sampling/sampling_logp_difference/mean": 0.01586540974676609, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 170.046875, "completions/mean_terminated_length": 170.046875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.35302817821502686, "epoch": 0.04411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.002595660226809006, "kl": 0.000633524265140295, "learning_rate": 1.4227642276422763e-07, "loss": 0.0, "num_tokens": 1117704.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6610043048858643, "sampling/importance_sampling_ratio/mean": 0.999608039855957, "sampling/importance_sampling_ratio/min": 0.6538148522377014, "sampling/sampling_logp_difference/max": 0.5074224472045898, "sampling/sampling_logp_difference/mean": 0.016183484345674515, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.45303717255592346, "epoch": 0.04534313725490196, "frac_reward_zero_std": 0.25, "grad_norm": 1.60602766171904, "kl": 0.0005130674107931554, "learning_rate": 1.4634146341463413e-07, "loss": 0.0166, "num_tokens": 1151184.0, "reward": 0.4375, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3496991395950317, "sampling/importance_sampling_ratio/mean": 1.0000017881393433, "sampling/importance_sampling_ratio/min": 0.6291665434837341, "sampling/sampling_logp_difference/max": 0.46335935592651367, "sampling/sampling_logp_difference/mean": 0.01684834063053131, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 218.3125, "completions/mean_terminated_length": 218.3125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3316297233104706, "epoch": 0.04656862745098039, "frac_reward_zero_std": 0.25, "grad_norm": 1.3069275862513603, "kl": 0.0005636264686472714, "learning_rate": 1.5040650406504065e-07, "loss": -0.0063, "num_tokens": 1185940.0, "reward": 0.28125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.495247483253479, "sampling/importance_sampling_ratio/mean": 1.000683307647705, "sampling/importance_sampling_ratio/min": 0.6321820616722107, "sampling/sampling_logp_difference/max": 0.45857787132263184, "sampling/sampling_logp_difference/mean": 0.014935510233044624, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 212.828125, "completions/mean_terminated_length": 212.828125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.43204957246780396, "epoch": 0.04779411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.1820639433995703, "kl": 0.0007093902095220983, "learning_rate": 1.5447154471544717e-07, "loss": 0.0263, "num_tokens": 1213913.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.620730996131897, "sampling/importance_sampling_ratio/mean": 1.0002366304397583, "sampling/importance_sampling_ratio/min": 0.6387297511100769, "sampling/sampling_logp_difference/max": 0.482877254486084, "sampling/sampling_logp_difference/mean": 0.017711803317070007, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 191.453125, "completions/mean_terminated_length": 191.453125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4706189036369324, "epoch": 0.049019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024790988863974115, "kl": 0.0006636378820985556, "learning_rate": 1.5853658536585366e-07, "loss": 0.0, "num_tokens": 1244230.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.614878535270691, "sampling/importance_sampling_ratio/mean": 1.0000481605529785, "sampling/importance_sampling_ratio/min": 0.6086853742599487, "sampling/sampling_logp_difference/max": 0.49645376205444336, "sampling/sampling_logp_difference/mean": 0.017970986664295197, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4288448393344879, "epoch": 0.05024509803921569, "frac_reward_zero_std": 0.0, "grad_norm": 1.6378922477411386, "kl": 0.0005438131629489362, "learning_rate": 1.6260162601626016e-07, "loss": 0.027, "num_tokens": 1273534.0, "reward": 0.25, "reward_std": 0.7623475193977356, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6325310468673706, "sampling/importance_sampling_ratio/mean": 0.9998799562454224, "sampling/importance_sampling_ratio/min": 0.5624527931213379, "sampling/sampling_logp_difference/max": 0.5754480361938477, "sampling/sampling_logp_difference/mean": 0.01654253527522087, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 187.546875, "completions/mean_terminated_length": 187.546875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3429829180240631, "epoch": 0.051470588235294115, "frac_reward_zero_std": 0.75, "grad_norm": 0.885346806108457, "kl": 0.0006208082195371389, "learning_rate": 1.6666666666666665e-07, "loss": 0.0071, "num_tokens": 1303201.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.473521113395691, "sampling/importance_sampling_ratio/mean": 1.0007667541503906, "sampling/importance_sampling_ratio/min": 0.679732620716095, "sampling/sampling_logp_difference/max": 0.38765478134155273, "sampling/sampling_logp_difference/mean": 0.015743326395750046, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3522500991821289, "epoch": 0.05269607843137255, "frac_reward_zero_std": 0.5, "grad_norm": 1.5506841207393454, "kl": 0.0006877593696117401, "learning_rate": 1.7073170731707317e-07, "loss": 0.0157, "num_tokens": 1327577.0, "reward": 0.53125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4399316310882568, "sampling/importance_sampling_ratio/mean": 0.99951171875, "sampling/importance_sampling_ratio/min": 0.5969815254211426, "sampling/sampling_logp_difference/max": 0.515869140625, "sampling/sampling_logp_difference/mean": 0.016115540638566017, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 185.1875, "completions/mean_terminated_length": 185.1875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.40408167243003845, "epoch": 0.05392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.9268900876701384, "kl": 0.0006102619227021933, "learning_rate": 1.7479674796747966e-07, "loss": -0.0044, "num_tokens": 1357861.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6210482120513916, "sampling/importance_sampling_ratio/mean": 0.9999095797538757, "sampling/importance_sampling_ratio/min": 0.6182252764701843, "sampling/sampling_logp_difference/max": 0.48307299613952637, "sampling/sampling_logp_difference/mean": 0.01602936163544655, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 231.109375, "completions/mean_terminated_length": 231.109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.42715632915496826, "epoch": 0.05514705882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.1764580024959685, "kl": 0.0006182037759572268, "learning_rate": 1.7886178861788619e-07, "loss": -0.0593, "num_tokens": 1402540.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6230255365371704, "sampling/importance_sampling_ratio/mean": 1.0002163648605347, "sampling/importance_sampling_ratio/min": 0.5318665504455566, "sampling/sampling_logp_difference/max": 0.6313626766204834, "sampling/sampling_logp_difference/mean": 0.017635690048336983, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 172.46875, "completions/mean_terminated_length": 172.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3358648121356964, "epoch": 0.056372549019607844, "frac_reward_zero_std": 0.75, "grad_norm": 1.0010274065990643, "kl": 0.0006443964084610343, "learning_rate": 1.8292682926829268e-07, "loss": 0.0242, "num_tokens": 1425498.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4986183643341064, "sampling/importance_sampling_ratio/mean": 0.9992099404335022, "sampling/importance_sampling_ratio/min": 0.6184446215629578, "sampling/sampling_logp_difference/max": 0.4805476665496826, "sampling/sampling_logp_difference/mean": 0.016474541276693344, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3795761466026306, "epoch": 0.05759803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029523160938459856, "kl": 0.0006913819233886898, "learning_rate": 1.8699186991869917e-07, "loss": 0.0, "num_tokens": 1453602.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4270973205566406, "sampling/importance_sampling_ratio/mean": 0.9996356964111328, "sampling/importance_sampling_ratio/min": 0.6219667792320251, "sampling/sampling_logp_difference/max": 0.4748685359954834, "sampling/sampling_logp_difference/mean": 0.016697335988283157, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.28151488304138184, "epoch": 0.058823529411764705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028058067350995694, "kl": 0.000663014012388885, "learning_rate": 1.910569105691057e-07, "loss": 0.0, "num_tokens": 1480122.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.623085856437683, "sampling/importance_sampling_ratio/mean": 1.0003767013549805, "sampling/importance_sampling_ratio/min": 0.6483780145645142, "sampling/sampling_logp_difference/max": 0.4843292236328125, "sampling/sampling_logp_difference/mean": 0.014155544340610504, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 170.28125, "completions/mean_terminated_length": 170.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.43864426016807556, "epoch": 0.06004901960784314, "frac_reward_zero_std": 0.75, "grad_norm": 0.9550066819946098, "kl": 0.0007230397895909846, "learning_rate": 1.951219512195122e-07, "loss": -0.0301, "num_tokens": 1509404.0, "reward": -0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4191150665283203, "sampling/importance_sampling_ratio/mean": 1.0003002882003784, "sampling/importance_sampling_ratio/min": 0.37228211760520935, "sampling/sampling_logp_difference/max": 0.9881033897399902, "sampling/sampling_logp_difference/mean": 0.0182917769998312, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 173.3125, "completions/mean_terminated_length": 173.3125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3217679262161255, "epoch": 0.061274509803921566, "frac_reward_zero_std": 1.0, "grad_norm": 0.004175269613817593, "kl": 0.0007267352193593979, "learning_rate": 1.9918699186991868e-07, "loss": 0.0, "num_tokens": 1537168.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3975340127944946, "sampling/importance_sampling_ratio/mean": 1.000385046005249, "sampling/importance_sampling_ratio/min": 0.4469026029109955, "sampling/sampling_logp_difference/max": 0.8054146766662598, "sampling/sampling_logp_difference/mean": 0.015700964257121086, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 177.296875, "completions/mean_terminated_length": 177.296875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4188961982727051, "epoch": 0.0625, "frac_reward_zero_std": 0.5, "grad_norm": 1.397865288577748, "kl": 0.0008356262696906924, "learning_rate": 2.032520325203252e-07, "loss": -0.0179, "num_tokens": 1566051.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8231755495071411, "sampling/importance_sampling_ratio/mean": 1.0001540184020996, "sampling/importance_sampling_ratio/min": 0.6267947554588318, "sampling/sampling_logp_difference/max": 0.6005797386169434, "sampling/sampling_logp_difference/mean": 0.0184800885617733, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 214.90625, "completions/mean_terminated_length": 214.90625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4530605673789978, "epoch": 0.06372549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.2814797926932124, "kl": 0.000573880213778466, "learning_rate": 2.073170731707317e-07, "loss": 0.0593, "num_tokens": 1605629.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4754071235656738, "sampling/importance_sampling_ratio/mean": 0.9998928308486938, "sampling/importance_sampling_ratio/min": 0.5536072850227356, "sampling/sampling_logp_difference/max": 0.5912997722625732, "sampling/sampling_logp_difference/mean": 0.01821010746061802, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 186.6875, "completions/mean_terminated_length": 186.6875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3925783634185791, "epoch": 0.06495098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030464339417726537, "kl": 0.0006757756927981973, "learning_rate": 2.1138211382113822e-07, "loss": 0.0, "num_tokens": 1639193.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8943507671356201, "sampling/importance_sampling_ratio/mean": 0.9993544816970825, "sampling/importance_sampling_ratio/min": 0.6423728466033936, "sampling/sampling_logp_difference/max": 0.63887619972229, "sampling/sampling_logp_difference/mean": 0.01692541316151619, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 213.59375, "completions/mean_terminated_length": 213.59375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4210030734539032, "epoch": 0.0661764705882353, "frac_reward_zero_std": 0.25, "grad_norm": 1.5213873136283464, "kl": 0.000580059364438057, "learning_rate": 2.154471544715447e-07, "loss": 0.0368, "num_tokens": 1668063.0, "reward": 0.21875, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4799457788467407, "sampling/importance_sampling_ratio/mean": 0.9999606609344482, "sampling/importance_sampling_ratio/min": 0.4670323133468628, "sampling/sampling_logp_difference/max": 0.7613568305969238, "sampling/sampling_logp_difference/mean": 0.016792941838502884, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 192.5, "completions/mean_terminated_length": 192.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.39150482416152954, "epoch": 0.06740196078431372, "frac_reward_zero_std": 0.5, "grad_norm": 1.3123382300934188, "kl": 0.0006600832566618919, "learning_rate": 2.195121951219512e-07, "loss": 0.0659, "num_tokens": 1705231.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5363892316818237, "sampling/importance_sampling_ratio/mean": 1.0001909732818604, "sampling/importance_sampling_ratio/min": 0.610666036605835, "sampling/sampling_logp_difference/max": 0.49320507049560547, "sampling/sampling_logp_difference/mean": 0.01660531759262085, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 182.3125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.31077295541763306, "epoch": 0.06862745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.002448791989654552, "kl": 0.0006591076962649822, "learning_rate": 2.235772357723577e-07, "loss": 0.0, "num_tokens": 1735811.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.789772629737854, "sampling/importance_sampling_ratio/mean": 0.9997566938400269, "sampling/importance_sampling_ratio/min": 0.4267900586128235, "sampling/sampling_logp_difference/max": 0.8514630794525146, "sampling/sampling_logp_difference/mean": 0.015317326411604881, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 224.578125, "completions/mean_terminated_length": 224.578125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3348657488822937, "epoch": 0.06985294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.7502445285201713, "kl": 0.0005730668781325221, "learning_rate": 2.2764227642276422e-07, "loss": 0.0215, "num_tokens": 1767112.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5211312770843506, "sampling/importance_sampling_ratio/mean": 1.0004832744598389, "sampling/importance_sampling_ratio/min": 0.5496712327003479, "sampling/sampling_logp_difference/max": 0.5984349250793457, "sampling/sampling_logp_difference/mean": 0.014861776493489742, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 156.359375, "completions/mean_terminated_length": 156.359375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3165697455406189, "epoch": 0.07107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9152807101002652, "kl": 0.0006824785377830267, "learning_rate": 2.3170731707317074e-07, "loss": 0.0046, "num_tokens": 1790991.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6023153066635132, "sampling/importance_sampling_ratio/mean": 0.9994411468505859, "sampling/importance_sampling_ratio/min": 0.6273468732833862, "sampling/sampling_logp_difference/max": 0.471449613571167, "sampling/sampling_logp_difference/mean": 0.015117138624191284, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.23887406289577484, "epoch": 0.07230392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033072866061355233, "kl": 0.0006151001434773207, "learning_rate": 2.3577235772357723e-07, "loss": 0.0, "num_tokens": 1817723.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6221637725830078, "sampling/importance_sampling_ratio/mean": 1.0002944469451904, "sampling/importance_sampling_ratio/min": 0.6680719256401062, "sampling/sampling_logp_difference/max": 0.4837608337402344, "sampling/sampling_logp_difference/mean": 0.01238096784800291, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 189.90625, "completions/mean_terminated_length": 189.90625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.36334773898124695, "epoch": 0.07352941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.1494354820963015, "kl": 0.0006102257175371051, "learning_rate": 2.3983739837398373e-07, "loss": -0.0286, "num_tokens": 1844341.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.499464750289917, "sampling/importance_sampling_ratio/mean": 1.0000648498535156, "sampling/importance_sampling_ratio/min": 0.6169834733009338, "sampling/sampling_logp_difference/max": 0.4829130172729492, "sampling/sampling_logp_difference/mean": 0.015710413455963135, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 226.703125, "completions/mean_terminated_length": 226.703125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2888544797897339, "epoch": 0.07475490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.002962674547903842, "kl": 0.0005360324867069721, "learning_rate": 2.439024390243902e-07, "loss": 0.0, "num_tokens": 1878674.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4062519073486328, "sampling/importance_sampling_ratio/mean": 1.000124454498291, "sampling/importance_sampling_ratio/min": 0.4441189169883728, "sampling/sampling_logp_difference/max": 0.8116629123687744, "sampling/sampling_logp_difference/mean": 0.013115715235471725, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 181.46875, "completions/mean_terminated_length": 181.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.34271660447120667, "epoch": 0.07598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.005683725741503254, "kl": 0.0007463833317160606, "learning_rate": 2.479674796747967e-07, "loss": 0.0, "num_tokens": 1907968.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5646599531173706, "sampling/importance_sampling_ratio/mean": 0.9999079704284668, "sampling/importance_sampling_ratio/min": 0.3687233030796051, "sampling/sampling_logp_difference/max": 0.997708797454834, "sampling/sampling_logp_difference/mean": 0.016180504113435745, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 158.671875, "completions/mean_terminated_length": 158.671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3199722468852997, "epoch": 0.07720588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 1.213606157980981, "kl": 0.0006946529028937221, "learning_rate": 2.520325203252032e-07, "loss": -0.0029, "num_tokens": 1932379.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5072897672653198, "sampling/importance_sampling_ratio/mean": 1.0006909370422363, "sampling/importance_sampling_ratio/min": 0.6118065118789673, "sampling/sampling_logp_difference/max": 0.49133920669555664, "sampling/sampling_logp_difference/mean": 0.015027320943772793, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 208.796875, "completions/mean_terminated_length": 208.796875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.416431188583374, "epoch": 0.0784313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.8846816716558332, "kl": 0.0005686272052116692, "learning_rate": 2.5609756097560976e-07, "loss": 0.0138, "num_tokens": 1966670.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6296825408935547, "sampling/importance_sampling_ratio/mean": 1.000244140625, "sampling/importance_sampling_ratio/min": 0.6546962857246399, "sampling/sampling_logp_difference/max": 0.4883852005004883, "sampling/sampling_logp_difference/mean": 0.016528785228729248, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 211.953125, "completions/mean_terminated_length": 211.953125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.30486810207366943, "epoch": 0.07965686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.005054483798044397, "kl": 0.0006241274531930685, "learning_rate": 2.6016260162601625e-07, "loss": 0.0, "num_tokens": 1999547.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277800559997559, "sampling/importance_sampling_ratio/mean": 0.9995065927505493, "sampling/importance_sampling_ratio/min": 0.3745501935482025, "sampling/sampling_logp_difference/max": 0.9820294380187988, "sampling/sampling_logp_difference/mean": 0.013507379218935966, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 185.734375, "completions/mean_terminated_length": 185.734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4169462323188782, "epoch": 0.08088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.5179666940788312, "kl": 0.0006654216558672488, "learning_rate": 2.6422764227642274e-07, "loss": 0.0166, "num_tokens": 2038266.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.8034464120864868, "sampling/importance_sampling_ratio/mean": 0.9994356632232666, "sampling/importance_sampling_ratio/min": 0.5457186102867126, "sampling/sampling_logp_difference/max": 0.60565185546875, "sampling/sampling_logp_difference/mean": 0.017760932445526123, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 182.28125, "completions/mean_terminated_length": 182.28125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.36594781279563904, "epoch": 0.0821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.0225663216333327, "kl": 0.000694277579896152, "learning_rate": 2.682926829268293e-07, "loss": 0.0023, "num_tokens": 2065916.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.395979404449463, "sampling/importance_sampling_ratio/mean": 0.9999436736106873, "sampling/importance_sampling_ratio/min": 0.6172131299972534, "sampling/sampling_logp_difference/max": 0.4825408458709717, "sampling/sampling_logp_difference/mean": 0.016853000968694687, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.39735686779022217, "epoch": 0.08333333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.924838978895092, "kl": 0.00048797359340824187, "learning_rate": 2.7235772357723573e-07, "loss": 0.0183, "num_tokens": 2106668.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4861767292022705, "sampling/importance_sampling_ratio/mean": 0.9999603033065796, "sampling/importance_sampling_ratio/min": 0.5961363911628723, "sampling/sampling_logp_difference/max": 0.5172858238220215, "sampling/sampling_logp_difference/mean": 0.015098122879862785, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 204.203125, "completions/mean_terminated_length": 204.203125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.43868178129196167, "epoch": 0.08455882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.247102958162562, "kl": 0.0006902640452608466, "learning_rate": 2.764227642276423e-07, "loss": -0.0286, "num_tokens": 2136217.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4298206567764282, "sampling/importance_sampling_ratio/mean": 1.0002532005310059, "sampling/importance_sampling_ratio/min": 0.6179476380348206, "sampling/sampling_logp_difference/max": 0.4813516139984131, "sampling/sampling_logp_difference/mean": 0.018437707796692848, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 220.515625, "completions/mean_terminated_length": 220.515625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4509945511817932, "epoch": 0.0857843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.2776782296849163, "kl": 0.0005820897640660405, "learning_rate": 2.8048780487804877e-07, "loss": 0.0004, "num_tokens": 2172970.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6365644931793213, "sampling/importance_sampling_ratio/mean": 0.9998390674591064, "sampling/importance_sampling_ratio/min": 0.33703523874282837, "sampling/sampling_logp_difference/max": 1.0875678062438965, "sampling/sampling_logp_difference/mean": 0.017490090802311897, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 187.453125, "completions/mean_terminated_length": 187.453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.28403034806251526, "epoch": 0.08700980392156862, "frac_reward_zero_std": 1.0, "grad_norm": 0.003153047585758706, "kl": 0.0006190944695845246, "learning_rate": 2.8455284552845527e-07, "loss": 0.0, "num_tokens": 2200375.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.276409387588501, "sampling/importance_sampling_ratio/mean": 0.9995650053024292, "sampling/importance_sampling_ratio/min": 0.6188837289810181, "sampling/sampling_logp_difference/max": 0.47983789443969727, "sampling/sampling_logp_difference/mean": 0.013221720233559608, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 249.796875, "completions/mean_terminated_length": 249.796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4590471386909485, "epoch": 0.08823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2136650583335309, "kl": 0.0005434445920400321, "learning_rate": 2.886178861788618e-07, "loss": 0.0814, "num_tokens": 2232778.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3725579977035522, "sampling/importance_sampling_ratio/mean": 1.0002747774124146, "sampling/importance_sampling_ratio/min": 0.6689460277557373, "sampling/sampling_logp_difference/max": 0.4020519256591797, "sampling/sampling_logp_difference/mean": 0.016279827803373337, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 236.3125, "completions/mean_terminated_length": 236.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.41624367237091064, "epoch": 0.08946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9831733101434714, "kl": 0.0006176315946504474, "learning_rate": 2.9268292682926825e-07, "loss": -0.0187, "num_tokens": 2266718.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4574370384216309, "sampling/importance_sampling_ratio/mean": 1.0001189708709717, "sampling/importance_sampling_ratio/min": 0.5600042939186096, "sampling/sampling_logp_difference/max": 0.5798108577728271, "sampling/sampling_logp_difference/mean": 0.015832317993044853, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 169.28125, "completions/mean_terminated_length": 169.28125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2937129735946655, "epoch": 0.09068627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029593835234128824, "kl": 0.0006617329199798405, "learning_rate": 2.967479674796748e-07, "loss": 0.0, "num_tokens": 2290800.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6446852684020996, "sampling/importance_sampling_ratio/mean": 0.9997486472129822, "sampling/importance_sampling_ratio/min": 0.4876939654350281, "sampling/sampling_logp_difference/max": 0.7180671691894531, "sampling/sampling_logp_difference/mean": 0.014586403034627438, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 187.734375, "completions/mean_terminated_length": 187.734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.39507994055747986, "epoch": 0.09191176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.8095647807776131, "kl": 0.0006857198313809931, "learning_rate": 3.008130081300813e-07, "loss": 0.0074, "num_tokens": 2320239.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4050134420394897, "sampling/importance_sampling_ratio/mean": 0.9997504949569702, "sampling/importance_sampling_ratio/min": 0.6153008937835693, "sampling/sampling_logp_difference/max": 0.4856438636779785, "sampling/sampling_logp_difference/mean": 0.017259499058127403, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 170.671875, "completions/mean_terminated_length": 170.671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.27348703145980835, "epoch": 0.09313725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.004614003351554754, "kl": 0.0006742796977050602, "learning_rate": 3.048780487804878e-07, "loss": 0.0, "num_tokens": 2345514.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6229311227798462, "sampling/importance_sampling_ratio/mean": 1.0002658367156982, "sampling/importance_sampling_ratio/min": 0.6682140231132507, "sampling/sampling_logp_difference/max": 0.4842338562011719, "sampling/sampling_logp_difference/mean": 0.013882113620638847, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 213.09375, "completions/mean_terminated_length": 213.09375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.32055598497390747, "epoch": 0.09436274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.8590981531477943, "kl": 0.0005941680865362287, "learning_rate": 3.0894308943089434e-07, "loss": -0.0238, "num_tokens": 2376512.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4924719333648682, "sampling/importance_sampling_ratio/mean": 0.9996081590652466, "sampling/importance_sampling_ratio/min": 0.6148074865341187, "sampling/sampling_logp_difference/max": 0.4864461421966553, "sampling/sampling_logp_difference/mean": 0.014241337776184082, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 194.734375, "completions/mean_terminated_length": 194.734375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.35685548186302185, "epoch": 0.09558823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 0.97051460803959, "kl": 0.000702782766893506, "learning_rate": 3.130081300813008e-07, "loss": 0.0093, "num_tokens": 2403519.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.546732783317566, "sampling/importance_sampling_ratio/mean": 0.9990202188491821, "sampling/importance_sampling_ratio/min": 0.6093260049819946, "sampling/sampling_logp_difference/max": 0.49540185928344727, "sampling/sampling_logp_difference/mean": 0.016112372279167175, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 210.171875, "completions/mean_terminated_length": 210.171875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3866679072380066, "epoch": 0.09681372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 0.949837619775768, "kl": 0.0005758454790338874, "learning_rate": 3.170731707317073e-07, "loss": -0.0207, "num_tokens": 2438938.0, "reward": 0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5747621059417725, "sampling/importance_sampling_ratio/mean": 0.9996163249015808, "sampling/importance_sampling_ratio/min": 0.6254509687423706, "sampling/sampling_logp_difference/max": 0.4692823886871338, "sampling/sampling_logp_difference/mean": 0.015203858725726604, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 232.84375, "completions/mean_terminated_length": 232.84375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4193274974822998, "epoch": 0.09803921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.350014838850733, "kl": 0.0006424374878406525, "learning_rate": 3.211382113821138e-07, "loss": 0.012, "num_tokens": 2473376.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.637217402458191, "sampling/importance_sampling_ratio/mean": 1.000213861465454, "sampling/importance_sampling_ratio/min": 0.6215099096298218, "sampling/sampling_logp_difference/max": 0.4929981231689453, "sampling/sampling_logp_difference/mean": 0.016783427447080612, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.33013027906417847, "epoch": 0.09926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.8962667578457428, "kl": 0.0007790140807628632, "learning_rate": 3.252032520325203e-07, "loss": 0.0044, "num_tokens": 2500320.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.39943265914917, "sampling/importance_sampling_ratio/mean": 0.9993496537208557, "sampling/importance_sampling_ratio/min": 0.3412167429924011, "sampling/sampling_logp_difference/max": 1.0752373933792114, "sampling/sampling_logp_difference/mean": 0.016874413937330246, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 182.796875, "completions/mean_terminated_length": 182.796875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.347734659910202, "epoch": 0.10049019607843138, "frac_reward_zero_std": 0.75, "grad_norm": 1.092648006740298, "kl": 0.0006662920350208879, "learning_rate": 3.292682926829268e-07, "loss": -0.0194, "num_tokens": 2526803.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6284722089767456, "sampling/importance_sampling_ratio/mean": 1.0001274347305298, "sampling/importance_sampling_ratio/min": 0.6815561652183533, "sampling/sampling_logp_difference/max": 0.4876422882080078, "sampling/sampling_logp_difference/mean": 0.01490835938602686, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 180.65625, "completions/mean_terminated_length": 180.65625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3616575598716736, "epoch": 0.1017156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.003146758837083494, "kl": 0.0007405805517919362, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "num_tokens": 2552509.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6335662603378296, "sampling/importance_sampling_ratio/mean": 0.9998728036880493, "sampling/importance_sampling_ratio/min": 0.6259642839431763, "sampling/sampling_logp_difference/max": 0.4907655715942383, "sampling/sampling_logp_difference/mean": 0.016518738120794296, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3743305802345276, "epoch": 0.10294117647058823, "frac_reward_zero_std": 0.75, "grad_norm": 0.7807699925989458, "kl": 0.0005720094195567071, "learning_rate": 3.3739837398373985e-07, "loss": 0.0007, "num_tokens": 2589677.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.576317548751831, "sampling/importance_sampling_ratio/mean": 1.0004931688308716, "sampling/importance_sampling_ratio/min": 0.6033816933631897, "sampling/sampling_logp_difference/max": 0.5052052736282349, "sampling/sampling_logp_difference/mean": 0.01565365120768547, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4013028144836426, "epoch": 0.10416666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.9127288098095632, "kl": 0.0007912813453003764, "learning_rate": 3.4146341463414634e-07, "loss": -0.0118, "num_tokens": 2624283.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.480612874031067, "sampling/importance_sampling_ratio/mean": 0.9996029734611511, "sampling/importance_sampling_ratio/min": 0.6100897789001465, "sampling/sampling_logp_difference/max": 0.49414920806884766, "sampling/sampling_logp_difference/mean": 0.017500976100564003, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 162.78125, "completions/mean_terminated_length": 162.78125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3269077241420746, "epoch": 0.1053921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.0857299767643662, "kl": 0.0007406205404549837, "learning_rate": 3.4552845528455284e-07, "loss": 0.0118, "num_tokens": 2650253.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.500138521194458, "sampling/importance_sampling_ratio/mean": 1.000192642211914, "sampling/importance_sampling_ratio/min": 0.6159108877182007, "sampling/sampling_logp_difference/max": 0.4846529960632324, "sampling/sampling_logp_difference/mean": 0.015514223836362362, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 213.9375, "completions/mean_terminated_length": 213.9375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3853444457054138, "epoch": 0.10661764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.8887578822314383, "kl": 0.000650618749205023, "learning_rate": 3.4959349593495933e-07, "loss": 0.0139, "num_tokens": 2681929.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6184998750686646, "sampling/importance_sampling_ratio/mean": 1.0007580518722534, "sampling/importance_sampling_ratio/min": 0.6482299566268921, "sampling/sampling_logp_difference/max": 0.48149967193603516, "sampling/sampling_logp_difference/mean": 0.015420891344547272, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 179.578125, "completions/mean_terminated_length": 179.578125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.30991730093955994, "epoch": 0.10784313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.0912913059503908, "kl": 0.00088326825061813, "learning_rate": 3.536585365853658e-07, "loss": 0.0072, "num_tokens": 2708206.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5446949005126953, "sampling/importance_sampling_ratio/mean": 1.0001922845840454, "sampling/importance_sampling_ratio/min": 0.662237823009491, "sampling/sampling_logp_difference/max": 0.4348263740539551, "sampling/sampling_logp_difference/mean": 0.014573959633708, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2804887294769287, "epoch": 0.1090686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.004476988539064151, "kl": 0.0007030559936538339, "learning_rate": 3.5772357723577237e-07, "loss": 0.0, "num_tokens": 2739430.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5749434232711792, "sampling/importance_sampling_ratio/mean": 0.9996755123138428, "sampling/importance_sampling_ratio/min": 0.5952799916267395, "sampling/sampling_logp_difference/max": 0.5187234878540039, "sampling/sampling_logp_difference/mean": 0.012805728241801262, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 185.703125, "completions/mean_terminated_length": 185.703125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3637750744819641, "epoch": 0.11029411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 1.0987040629334006, "kl": 0.0008268561214208603, "learning_rate": 3.6178861788617886e-07, "loss": 0.013, "num_tokens": 2770115.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7029767036437988, "sampling/importance_sampling_ratio/mean": 0.9999949336051941, "sampling/importance_sampling_ratio/min": 0.5304797291755676, "sampling/sampling_logp_difference/max": 0.6339735984802246, "sampling/sampling_logp_difference/mean": 0.01635168120265007, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 225.59375, "completions/mean_terminated_length": 225.59375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.39361435174942017, "epoch": 0.11151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.003172991166124158, "kl": 0.0006379781407304108, "learning_rate": 3.6585365853658536e-07, "loss": 0.0, "num_tokens": 2803977.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4007288217544556, "sampling/importance_sampling_ratio/mean": 1.0001379251480103, "sampling/importance_sampling_ratio/min": 0.6248331665992737, "sampling/sampling_logp_difference/max": 0.47027063369750977, "sampling/sampling_logp_difference/mean": 0.015284635126590729, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 163.96875, "completions/mean_terminated_length": 163.96875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.28181523084640503, "epoch": 0.11274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.8625871157981916, "kl": 0.0008367709233425558, "learning_rate": 3.6991869918699185e-07, "loss": 0.0095, "num_tokens": 2827815.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.348543643951416, "sampling/importance_sampling_ratio/mean": 1.000134825706482, "sampling/importance_sampling_ratio/min": 0.6121360063552856, "sampling/sampling_logp_difference/max": 0.4908008575439453, "sampling/sampling_logp_difference/mean": 0.01428581029176712, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 247.796875, "completions/mean_terminated_length": 247.796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3397085666656494, "epoch": 0.11397058823529412, "frac_reward_zero_std": 0.75, "grad_norm": 0.8270845133939637, "kl": 0.0007522313389927149, "learning_rate": 3.7398373983739835e-07, "loss": -0.0304, "num_tokens": 2862634.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5790555477142334, "sampling/importance_sampling_ratio/mean": 1.0003384351730347, "sampling/importance_sampling_ratio/min": 0.6127174496650696, "sampling/sampling_logp_difference/max": 0.4898514747619629, "sampling/sampling_logp_difference/mean": 0.014473985880613327, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 153.296875, "completions/mean_terminated_length": 153.296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3286474049091339, "epoch": 0.11519607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.1208988814368832, "kl": 0.0012791944900527596, "learning_rate": 3.7804878048780484e-07, "loss": 0.0089, "num_tokens": 2886029.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.527778148651123, "sampling/importance_sampling_ratio/mean": 0.9993230104446411, "sampling/importance_sampling_ratio/min": 0.6139498353004456, "sampling/sampling_logp_difference/max": 0.4878420829772949, "sampling/sampling_logp_difference/mean": 0.016680724918842316, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 255.171875, "completions/mean_terminated_length": 255.171875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.35204946994781494, "epoch": 0.11642156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.1527361378746388, "kl": 0.0008182760793715715, "learning_rate": 3.821138211382114e-07, "loss": 0.0161, "num_tokens": 2921352.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5822359323501587, "sampling/importance_sampling_ratio/mean": 1.0001647472381592, "sampling/importance_sampling_ratio/min": 0.5632071495056152, "sampling/sampling_logp_difference/max": 0.5741077065467834, "sampling/sampling_logp_difference/mean": 0.014608575962483883, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 181.390625, "completions/mean_terminated_length": 181.390625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37239915132522583, "epoch": 0.11764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.005972359418865025, "kl": 0.0008802631637081504, "learning_rate": 3.861788617886179e-07, "loss": 0.0, "num_tokens": 2953889.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6360015869140625, "sampling/importance_sampling_ratio/mean": 1.0000799894332886, "sampling/importance_sampling_ratio/min": 0.3959225118160248, "sampling/sampling_logp_difference/max": 0.9265367984771729, "sampling/sampling_logp_difference/mean": 0.016587747260928154, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 188.203125, "completions/mean_terminated_length": 188.203125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.34458205103874207, "epoch": 0.11887254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.005579135570687071, "kl": 0.0009129910613410175, "learning_rate": 3.902439024390244e-07, "loss": 0.0, "num_tokens": 2982830.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4990748167037964, "sampling/importance_sampling_ratio/mean": 1.0001230239868164, "sampling/importance_sampling_ratio/min": 0.637395441532135, "sampling/sampling_logp_difference/max": 0.4503650665283203, "sampling/sampling_logp_difference/mean": 0.01516056526452303, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 176.8125, "completions/mean_terminated_length": 176.8125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.39370280504226685, "epoch": 0.12009803921568628, "frac_reward_zero_std": 0.5, "grad_norm": 1.3861097080427915, "kl": 0.0012123179621994495, "learning_rate": 3.9430894308943087e-07, "loss": -0.0181, "num_tokens": 3012642.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6179218292236328, "sampling/importance_sampling_ratio/mean": 1.000260829925537, "sampling/importance_sampling_ratio/min": 0.7118787169456482, "sampling/sampling_logp_difference/max": 0.481142520904541, "sampling/sampling_logp_difference/mean": 0.017914410680532455, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 183.265625, "completions/mean_terminated_length": 183.265625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.40355098247528076, "epoch": 0.1213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.0670426367198722, "kl": 0.0011470717145130038, "learning_rate": 3.9837398373983736e-07, "loss": 0.0129, "num_tokens": 3041123.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.545743703842163, "sampling/importance_sampling_ratio/mean": 1.0005805492401123, "sampling/importance_sampling_ratio/min": 0.6606181859970093, "sampling/sampling_logp_difference/max": 0.4355051517486572, "sampling/sampling_logp_difference/mean": 0.018123583868145943, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 246.734375, "completions/mean_terminated_length": 246.734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3937031626701355, "epoch": 0.12254901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 0.8196707546070746, "kl": 0.0010341871529817581, "learning_rate": 4.024390243902439e-07, "loss": 0.0152, "num_tokens": 3076018.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5291866064071655, "sampling/importance_sampling_ratio/mean": 1.0004017353057861, "sampling/importance_sampling_ratio/min": 0.6395527720451355, "sampling/sampling_logp_difference/max": 0.44698619842529297, "sampling/sampling_logp_difference/mean": 0.01549257431179285, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 239.59375, "completions/mean_terminated_length": 239.59375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4018251299858093, "epoch": 0.12377450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7982382817251258, "kl": 0.0009367854800075293, "learning_rate": 4.065040650406504e-07, "loss": 0.0088, "num_tokens": 3110232.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.650946021080017, "sampling/importance_sampling_ratio/mean": 0.9998608827590942, "sampling/importance_sampling_ratio/min": 0.6269895434379578, "sampling/sampling_logp_difference/max": 0.5013484954833984, "sampling/sampling_logp_difference/mean": 0.016505785286426544, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 204.421875, "completions/mean_terminated_length": 204.421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.32230591773986816, "epoch": 0.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.00750480542790826, "kl": 0.0011915290961042047, "learning_rate": 4.105691056910569e-07, "loss": 0.0, "num_tokens": 3142403.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5774788856506348, "sampling/importance_sampling_ratio/mean": 0.9997325539588928, "sampling/importance_sampling_ratio/min": 0.644951581954956, "sampling/sampling_logp_difference/max": 0.4558279514312744, "sampling/sampling_logp_difference/mean": 0.015833374112844467, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 167.09375, "completions/mean_terminated_length": 167.09375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.32066941261291504, "epoch": 0.12622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.008640263493909388, "kl": 0.001314917579293251, "learning_rate": 4.146341463414634e-07, "loss": 0.0, "num_tokens": 3173497.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4054081439971924, "sampling/importance_sampling_ratio/mean": 1.0004949569702148, "sampling/importance_sampling_ratio/min": 0.6147712469100952, "sampling/sampling_logp_difference/max": 0.48650503158569336, "sampling/sampling_logp_difference/mean": 0.015146872028708458, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 200.3125, "completions/mean_terminated_length": 200.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.38391387462615967, "epoch": 0.12745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.008062843336245079, "kl": 0.0012803412973880768, "learning_rate": 4.186991869918699e-07, "loss": 0.0, "num_tokens": 3204989.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3557255268096924, "sampling/importance_sampling_ratio/mean": 1.000128984451294, "sampling/importance_sampling_ratio/min": 0.6949819922447205, "sampling/sampling_logp_difference/max": 0.363869309425354, "sampling/sampling_logp_difference/mean": 0.0171358622610569, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 213.6875, "completions/mean_terminated_length": 213.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3612138032913208, "epoch": 0.12867647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.007391311454523405, "kl": 0.0011714284773916006, "learning_rate": 4.2276422764227643e-07, "loss": 0.0, "num_tokens": 3238041.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6307578086853027, "sampling/importance_sampling_ratio/mean": 1.0004353523254395, "sampling/importance_sampling_ratio/min": 0.6108908653259277, "sampling/sampling_logp_difference/max": 0.49283695220947266, "sampling/sampling_logp_difference/mean": 0.016306662932038307, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 189.0625, "completions/mean_terminated_length": 189.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3218452036380768, "epoch": 0.12990196078431374, "frac_reward_zero_std": 1.0, "grad_norm": 0.010263366656990456, "kl": 0.0015877524856477976, "learning_rate": 4.268292682926829e-07, "loss": 0.0, "num_tokens": 3265597.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5277644395828247, "sampling/importance_sampling_ratio/mean": 0.9993202090263367, "sampling/importance_sampling_ratio/min": 0.5364041328430176, "sampling/sampling_logp_difference/max": 0.6228674650192261, "sampling/sampling_logp_difference/mean": 0.016494762152433395, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 197.6875, "completions/mean_terminated_length": 197.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2655947804450989, "epoch": 0.13112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.006369894213665063, "kl": 0.0008513329667039216, "learning_rate": 4.308943089430894e-07, "loss": 0.0, "num_tokens": 3303545.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4051260948181152, "sampling/importance_sampling_ratio/mean": 0.9999285936355591, "sampling/importance_sampling_ratio/min": 0.6263183951377869, "sampling/sampling_logp_difference/max": 0.4678964614868164, "sampling/sampling_logp_difference/mean": 0.012676535174250603, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 191.03125, "completions/mean_terminated_length": 191.03125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.34833234548568726, "epoch": 0.1323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9575857032441931, "kl": 0.0016228670720010996, "learning_rate": 4.349593495934959e-07, "loss": -0.0078, "num_tokens": 3341883.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5313594341278076, "sampling/importance_sampling_ratio/mean": 1.000009298324585, "sampling/importance_sampling_ratio/min": 0.4878419041633606, "sampling/sampling_logp_difference/max": 0.7177639007568359, "sampling/sampling_logp_difference/mean": 0.01729895919561386, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 257.171875, "completions/mean_terminated_length": 257.171875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.41857966780662537, "epoch": 0.13357843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8022842118823669, "kl": 0.0011155225802212954, "learning_rate": 4.390243902439024e-07, "loss": 0.0205, "num_tokens": 3382614.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6637526750564575, "sampling/importance_sampling_ratio/mean": 0.9996939897537231, "sampling/importance_sampling_ratio/min": 0.62088942527771, "sampling/sampling_logp_difference/max": 0.5090756416320801, "sampling/sampling_logp_difference/mean": 0.017605653032660484, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 195.328125, "completions/mean_terminated_length": 195.328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3044811189174652, "epoch": 0.13480392156862744, "frac_reward_zero_std": 1.0, "grad_norm": 0.00782186742722866, "kl": 0.0011501931585371494, "learning_rate": 4.4308943089430896e-07, "loss": 0.0, "num_tokens": 3422267.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6345769166946411, "sampling/importance_sampling_ratio/mean": 0.9995689988136292, "sampling/importance_sampling_ratio/min": 0.595684289932251, "sampling/sampling_logp_difference/max": 0.5180444717407227, "sampling/sampling_logp_difference/mean": 0.014964728616178036, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 192.390625, "completions/mean_terminated_length": 192.390625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.2591615915298462, "epoch": 0.13602941176470587, "frac_reward_zero_std": 1.0, "grad_norm": 0.008764177979505195, "kl": 0.001395724480971694, "learning_rate": 4.471544715447154e-07, "loss": 0.0, "num_tokens": 3449172.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.620025634765625, "sampling/importance_sampling_ratio/mean": 1.0004347562789917, "sampling/importance_sampling_ratio/min": 0.6094062924385071, "sampling/sampling_logp_difference/max": 0.4952700138092041, "sampling/sampling_logp_difference/mean": 0.01283632405102253, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 216.203125, "completions/mean_terminated_length": 216.203125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3501753509044647, "epoch": 0.13725490196078433, "frac_reward_zero_std": 1.0, "grad_norm": 0.007326647429011458, "kl": 0.0012353763449937105, "learning_rate": 4.5121951219512194e-07, "loss": 0.0, "num_tokens": 3479073.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6356961727142334, "sampling/importance_sampling_ratio/mean": 1.0002961158752441, "sampling/importance_sampling_ratio/min": 0.6576474905014038, "sampling/sampling_logp_difference/max": 0.4920685291290283, "sampling/sampling_logp_difference/mean": 0.014653654769062996, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 249.59375, "completions/mean_terminated_length": 249.59375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.27881038188934326, "epoch": 0.13848039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063514451005622094, "kl": 0.0009215730242431164, "learning_rate": 4.5528455284552844e-07, "loss": 0.0, "num_tokens": 3516983.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.620772361755371, "sampling/importance_sampling_ratio/mean": 0.9997947812080383, "sampling/importance_sampling_ratio/min": 0.6486902832984924, "sampling/sampling_logp_difference/max": 0.48290276527404785, "sampling/sampling_logp_difference/mean": 0.012355177663266659, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 174.09375, "completions/mean_terminated_length": 174.09375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3457588851451874, "epoch": 0.13970588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.9080781021638189, "kl": 0.0013797450810670853, "learning_rate": 4.5934959349593493e-07, "loss": 0.0197, "num_tokens": 3545421.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.406604528427124, "sampling/importance_sampling_ratio/mean": 1.000349760055542, "sampling/importance_sampling_ratio/min": 0.662269115447998, "sampling/sampling_logp_difference/max": 0.4120832681655884, "sampling/sampling_logp_difference/mean": 0.015144167467951775, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 160.734375, "completions/mean_terminated_length": 160.734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.22996263206005096, "epoch": 0.1409313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.010243953254923614, "kl": 0.0013740160502493382, "learning_rate": 4.634146341463415e-07, "loss": 0.0, "num_tokens": 3571180.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5758028030395508, "sampling/importance_sampling_ratio/mean": 0.9999841451644897, "sampling/importance_sampling_ratio/min": 0.6630354523658752, "sampling/sampling_logp_difference/max": 0.45476484298706055, "sampling/sampling_logp_difference/mean": 0.012498574331402779, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.33914482593536377, "epoch": 0.14215686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.008385369656932104, "kl": 0.001285051228478551, "learning_rate": 4.674796747967479e-07, "loss": 0.0, "num_tokens": 3606984.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5045294761657715, "sampling/importance_sampling_ratio/mean": 1.0004189014434814, "sampling/importance_sampling_ratio/min": 0.6098470687866211, "sampling/sampling_logp_difference/max": 0.49454712867736816, "sampling/sampling_logp_difference/mean": 0.015076465904712677, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 229.140625, "completions/mean_terminated_length": 229.140625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.30870968103408813, "epoch": 0.14338235294117646, "frac_reward_zero_std": 0.75, "grad_norm": 0.7474120105497813, "kl": 0.0013130044098943472, "learning_rate": 4.7154471544715447e-07, "loss": 0.0243, "num_tokens": 3640401.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000335454940796, "sampling/importance_sampling_ratio/min": 0.6410284638404846, "sampling/sampling_logp_difference/max": 0.7243653535842896, "sampling/sampling_logp_difference/mean": 0.013415869325399399, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 245.890625, "completions/mean_terminated_length": 245.890625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.28496605157852173, "epoch": 0.14460784313725492, "frac_reward_zero_std": 0.75, "grad_norm": 0.7489191941362603, "kl": 0.0012819442199543118, "learning_rate": 4.756097560975609e-07, "loss": 0.0032, "num_tokens": 3675018.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5749266147613525, "sampling/importance_sampling_ratio/mean": 0.9996001720428467, "sampling/importance_sampling_ratio/min": 0.6194509267807007, "sampling/sampling_logp_difference/max": 0.47892189025878906, "sampling/sampling_logp_difference/mean": 0.01227761059999466, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 210.203125, "completions/mean_terminated_length": 210.203125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.41157370805740356, "epoch": 0.14583333333333334, "frac_reward_zero_std": 0.5, "grad_norm": 1.2819404259096743, "kl": 0.0021230392158031464, "learning_rate": 4.796747967479675e-07, "loss": 0.0261, "num_tokens": 3702663.0, "reward": -0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.448575496673584, "sampling/importance_sampling_ratio/mean": 0.9992985725402832, "sampling/importance_sampling_ratio/min": 0.5025063157081604, "sampling/sampling_logp_difference/max": 0.6881470680236816, "sampling/sampling_logp_difference/mean": 0.017737586051225662, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 198.546875, "completions/mean_terminated_length": 198.546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3729953169822693, "epoch": 0.14705882352941177, "frac_reward_zero_std": 0.5, "grad_norm": 1.3678292203558666, "kl": 0.002271226141601801, "learning_rate": 4.83739837398374e-07, "loss": -0.031, "num_tokens": 3730858.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5564074516296387, "sampling/importance_sampling_ratio/mean": 1.000110149383545, "sampling/importance_sampling_ratio/min": 0.6549456715583801, "sampling/sampling_logp_difference/max": 0.4423801898956299, "sampling/sampling_logp_difference/mean": 0.01663437858223915, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 218.671875, "completions/mean_terminated_length": 218.671875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.42304205894470215, "epoch": 0.1482843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9284858253703636, "kl": 0.0015721892705187201, "learning_rate": 4.878048780487804e-07, "loss": 0.0088, "num_tokens": 3768037.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4394360780715942, "sampling/importance_sampling_ratio/mean": 1.0001715421676636, "sampling/importance_sampling_ratio/min": 0.6298869848251343, "sampling/sampling_logp_difference/max": 0.46221494674682617, "sampling/sampling_logp_difference/mean": 0.018140073865652084, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 214.03125, "completions/mean_terminated_length": 214.03125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4136981666088104, "epoch": 0.14950980392156862, "frac_reward_zero_std": 0.75, "grad_norm": 0.8067982302135566, "kl": 0.001777131692506373, "learning_rate": 4.91869918699187e-07, "loss": -0.0021, "num_tokens": 3801895.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8919440507888794, "sampling/importance_sampling_ratio/mean": 0.9996967911720276, "sampling/importance_sampling_ratio/min": 0.3235868215560913, "sampling/sampling_logp_difference/max": 1.1282877922058105, "sampling/sampling_logp_difference/mean": 0.018149856477975845, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 164.8125, "completions/mean_terminated_length": 164.8125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.35000666975975037, "epoch": 0.15073529411764705, "frac_reward_zero_std": 0.75, "grad_norm": 0.9843246113783801, "kl": 0.0021282376255840063, "learning_rate": 4.959349593495934e-07, "loss": -0.0143, "num_tokens": 3826283.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4718385934829712, "sampling/importance_sampling_ratio/mean": 1.0010018348693848, "sampling/importance_sampling_ratio/min": 0.655526876449585, "sampling/sampling_logp_difference/max": 0.42231595516204834, "sampling/sampling_logp_difference/mean": 0.01719508320093155, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 218.109375, "completions/mean_terminated_length": 218.109375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.25339066982269287, "epoch": 0.15196078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.008079150541077773, "kl": 0.0012523139594122767, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 3868402.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4768990278244019, "sampling/importance_sampling_ratio/mean": 1.0000702142715454, "sampling/importance_sampling_ratio/min": 0.6396217942237854, "sampling/sampling_logp_difference/max": 0.4468783140182495, "sampling/sampling_logp_difference/mean": 0.012863853946328163, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 173.890625, "completions/mean_terminated_length": 173.890625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3082340657711029, "epoch": 0.15318627450980393, "frac_reward_zero_std": 0.75, "grad_norm": 1.0185918635434672, "kl": 0.0016419864259660244, "learning_rate": 5.040650406504064e-07, "loss": -0.007, "num_tokens": 3898715.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6783421039581299, "sampling/importance_sampling_ratio/mean": 1.0000501871109009, "sampling/importance_sampling_ratio/min": 0.19109542667865753, "sampling/sampling_logp_difference/max": 1.654982328414917, "sampling/sampling_logp_difference/mean": 0.014716839417815208, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 186.453125, "completions/mean_terminated_length": 186.453125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3567272424697876, "epoch": 0.15441176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 1.3837709451569948, "kl": 0.0018109779339283705, "learning_rate": 5.081300813008131e-07, "loss": 0.0437, "num_tokens": 3927912.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7580591440200806, "sampling/importance_sampling_ratio/mean": 1.000162959098816, "sampling/importance_sampling_ratio/min": 0.6622379422187805, "sampling/sampling_logp_difference/max": 0.5642104148864746, "sampling/sampling_logp_difference/mean": 0.014668013900518417, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 166.78125, "completions/mean_terminated_length": 166.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30008426308631897, "epoch": 0.1556372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.014836222650616791, "kl": 0.0022569410502910614, "learning_rate": 5.121951219512195e-07, "loss": 0.0, "num_tokens": 3954170.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 1.000068187713623, "sampling/importance_sampling_ratio/min": 0.613304078578949, "sampling/sampling_logp_difference/max": 0.4888944625854492, "sampling/sampling_logp_difference/mean": 0.014907223172485828, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 157.515625, "completions/mean_terminated_length": 157.515625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.32169437408447266, "epoch": 0.1568627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.014807923597855412, "kl": 0.002098316326737404, "learning_rate": 5.16260162601626e-07, "loss": 0.0, "num_tokens": 3980747.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5191376209259033, "sampling/importance_sampling_ratio/mean": 0.9997336864471436, "sampling/importance_sampling_ratio/min": 0.6103330254554749, "sampling/sampling_logp_difference/max": 0.49375057220458984, "sampling/sampling_logp_difference/mean": 0.016123417764902115, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.29940271377563477, "epoch": 0.15808823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 1.0916190885188615, "kl": 0.001945118885487318, "learning_rate": 5.203252032520325e-07, "loss": 0.04, "num_tokens": 4016639.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4510223865509033, "sampling/importance_sampling_ratio/mean": 0.9998721480369568, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.014073947444558144, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 195.046875, "completions/mean_terminated_length": 195.046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.30529260635375977, "epoch": 0.15931372549019607, "frac_reward_zero_std": 0.75, "grad_norm": 0.8566925567928513, "kl": 0.00208301586098969, "learning_rate": 5.24390243902439e-07, "loss": 0.0082, "num_tokens": 4045778.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008527040481567, "sampling/importance_sampling_ratio/min": 0.6134491562843323, "sampling/sampling_logp_difference/max": 0.698920488357544, "sampling/sampling_logp_difference/mean": 0.013193566352128983, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 148.796875, "completions/mean_terminated_length": 148.796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.28194862604141235, "epoch": 0.16053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.01877514514420134, "kl": 0.002552943304181099, "learning_rate": 5.284552845528455e-07, "loss": 0.0, "num_tokens": 4073733.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.578508973121643, "sampling/importance_sampling_ratio/mean": 1.0002617835998535, "sampling/importance_sampling_ratio/min": 0.3664003014564514, "sampling/sampling_logp_difference/max": 1.0040287971496582, "sampling/sampling_logp_difference/mean": 0.014787127263844013, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 154.109375, "completions/mean_terminated_length": 154.109375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2850548326969147, "epoch": 0.16176470588235295, "frac_reward_zero_std": 0.75, "grad_norm": 1.1842928599356437, "kl": 0.0039580995216965675, "learning_rate": 5.325203252032519e-07, "loss": -0.0018, "num_tokens": 4100572.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4497106075286865, "sampling/importance_sampling_ratio/mean": 0.9997336864471436, "sampling/importance_sampling_ratio/min": 0.6338287591934204, "sampling/sampling_logp_difference/max": 0.4559764862060547, "sampling/sampling_logp_difference/mean": 0.015777725726366043, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 146.546875, "completions/mean_terminated_length": 146.546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.35511481761932373, "epoch": 0.16299019607843138, "frac_reward_zero_std": 1.0, "grad_norm": 0.024482506909237974, "kl": 0.0033495896495878696, "learning_rate": 5.365853658536586e-07, "loss": 0.0, "num_tokens": 4129567.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6090610027313232, "sampling/importance_sampling_ratio/mean": 1.0000367164611816, "sampling/importance_sampling_ratio/min": 0.6313496828079224, "sampling/sampling_logp_difference/max": 0.4756507873535156, "sampling/sampling_logp_difference/mean": 0.0187184177339077, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.34629660844802856, "epoch": 0.1642156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.022087950599703535, "kl": 0.002930707298219204, "learning_rate": 5.40650406504065e-07, "loss": 0.0, "num_tokens": 4157059.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.615315556526184, "sampling/importance_sampling_ratio/mean": 0.9998091459274292, "sampling/importance_sampling_ratio/min": 0.6301627159118652, "sampling/sampling_logp_difference/max": 0.47953033447265625, "sampling/sampling_logp_difference/mean": 0.015880992636084557, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 191.1875, "completions/mean_terminated_length": 191.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3255227208137512, "epoch": 0.16544117647058823, "frac_reward_zero_std": 0.75, "grad_norm": 1.116328162716102, "kl": 0.002989242784678936, "learning_rate": 5.447154471544715e-07, "loss": -0.0028, "num_tokens": 4191519.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6071367263793945, "sampling/importance_sampling_ratio/mean": 1.0001487731933594, "sampling/importance_sampling_ratio/min": 0.6103999614715576, "sampling/sampling_logp_difference/max": 0.4936408996582031, "sampling/sampling_logp_difference/mean": 0.015889611095190048, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 203.546875, "completions/mean_terminated_length": 203.546875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3820006251335144, "epoch": 0.16666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02492721865244265, "kl": 0.0034382841549813747, "learning_rate": 5.487804878048781e-07, "loss": 0.0, "num_tokens": 4226802.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8545461893081665, "sampling/importance_sampling_ratio/mean": 1.000016212463379, "sampling/importance_sampling_ratio/min": 0.6229522228240967, "sampling/sampling_logp_difference/max": 0.6176400184631348, "sampling/sampling_logp_difference/mean": 0.017170652747154236, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28024011850357056, "epoch": 0.16789215686274508, "frac_reward_zero_std": 1.0, "grad_norm": 0.019821679154392032, "kl": 0.002866612747311592, "learning_rate": 5.528455284552846e-07, "loss": 0.0, "num_tokens": 4255186.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.581290364265442, "sampling/importance_sampling_ratio/mean": 1.0009236335754395, "sampling/importance_sampling_ratio/min": 0.6172630190849304, "sampling/sampling_logp_difference/max": 0.48246002197265625, "sampling/sampling_logp_difference/mean": 0.013574345037341118, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 206.515625, "completions/mean_terminated_length": 206.515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3306284546852112, "epoch": 0.16911764705882354, "frac_reward_zero_std": 0.75, "grad_norm": 1.022713078027099, "kl": 0.003378923051059246, "learning_rate": 5.56910569105691e-07, "loss": -0.0052, "num_tokens": 4285667.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4726227521896362, "sampling/importance_sampling_ratio/mean": 1.0002403259277344, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.0152666587382555, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 199.21875, "completions/mean_terminated_length": 199.21875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3423955738544464, "epoch": 0.17034313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.9941180025099868, "kl": 0.004970120266079903, "learning_rate": 5.609756097560975e-07, "loss": 0.0357, "num_tokens": 4315665.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4934946298599243, "sampling/importance_sampling_ratio/mean": 1.0001451969146729, "sampling/importance_sampling_ratio/min": 0.3357149362564087, "sampling/sampling_logp_difference/max": 1.0914928913116455, "sampling/sampling_logp_difference/mean": 0.01607022061944008, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 172.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.31557244062423706, "epoch": 0.1715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0323187840939241, "kl": 0.005174246151000261, "learning_rate": 5.650406504065041e-07, "loss": 0.0, "num_tokens": 4346517.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.621016263961792, "sampling/importance_sampling_ratio/mean": 0.9998432993888855, "sampling/importance_sampling_ratio/min": 0.6001259088516235, "sampling/sampling_logp_difference/max": 0.5106158256530762, "sampling/sampling_logp_difference/mean": 0.01690821535885334, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 164.578125, "completions/mean_terminated_length": 164.578125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23700770735740662, "epoch": 0.17279411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02986836784298849, "kl": 0.004557392559945583, "learning_rate": 5.691056910569105e-07, "loss": 0.0, "num_tokens": 4372026.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5181050300598145, "sampling/importance_sampling_ratio/mean": 0.9995629787445068, "sampling/importance_sampling_ratio/min": 0.6191670894622803, "sampling/sampling_logp_difference/max": 0.47938013076782227, "sampling/sampling_logp_difference/mean": 0.013194214552640915, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3119884729385376, "epoch": 0.17401960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.02426669659002909, "kl": 0.004346243105828762, "learning_rate": 5.73170731707317e-07, "loss": 0.0, "num_tokens": 4401538.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6281784772872925, "sampling/importance_sampling_ratio/mean": 1.0002410411834717, "sampling/importance_sampling_ratio/min": 0.639254093170166, "sampling/sampling_logp_difference/max": 0.4874619245529175, "sampling/sampling_logp_difference/mean": 0.01620330847799778, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2875828742980957, "epoch": 0.17524509803921567, "frac_reward_zero_std": 1.0, "grad_norm": 0.028400871307893368, "kl": 0.004842736292630434, "learning_rate": 5.772357723577236e-07, "loss": 0.0, "num_tokens": 4427338.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5251868963241577, "sampling/importance_sampling_ratio/mean": 1.0000712871551514, "sampling/importance_sampling_ratio/min": 0.680833637714386, "sampling/sampling_logp_difference/max": 0.4221169948577881, "sampling/sampling_logp_difference/mean": 0.015635253861546516, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 185.21875, "completions/mean_terminated_length": 185.21875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.32663196325302124, "epoch": 0.17647058823529413, "frac_reward_zero_std": 0.75, "grad_norm": 1.3266612292106221, "kl": 0.005485333502292633, "learning_rate": 5.813008130081301e-07, "loss": 0.0223, "num_tokens": 4466200.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5157999992370605, "sampling/importance_sampling_ratio/mean": 0.9999676942825317, "sampling/importance_sampling_ratio/min": 0.41252991557121277, "sampling/sampling_logp_difference/max": 0.8854465484619141, "sampling/sampling_logp_difference/mean": 0.01599922403693199, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 161.515625, "completions/mean_terminated_length": 161.515625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3241770267486572, "epoch": 0.17769607843137256, "frac_reward_zero_std": 0.75, "grad_norm": 1.0026817610712315, "kl": 0.005243861116468906, "learning_rate": 5.853658536585365e-07, "loss": -0.005, "num_tokens": 4493977.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.60780668258667, "sampling/importance_sampling_ratio/mean": 1.0003927946090698, "sampling/importance_sampling_ratio/min": 0.6585583686828613, "sampling/sampling_logp_difference/max": 0.4748709201812744, "sampling/sampling_logp_difference/mean": 0.015489340759813786, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 218.15625, "completions/mean_terminated_length": 218.15625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27188339829444885, "epoch": 0.17892156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.037233272570007664, "kl": 0.005310396663844585, "learning_rate": 5.894308943089431e-07, "loss": 0.0001, "num_tokens": 4530899.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5263862609863281, "sampling/importance_sampling_ratio/mean": 1.00016188621521, "sampling/importance_sampling_ratio/min": 0.6371117830276489, "sampling/sampling_logp_difference/max": 0.45081019401550293, "sampling/sampling_logp_difference/mean": 0.013202058151364326, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 199.078125, "completions/mean_terminated_length": 199.078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3040648102760315, "epoch": 0.1801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.0245297915998763, "kl": 0.004978617187589407, "learning_rate": 5.934959349593496e-07, "loss": -0.0188, "num_tokens": 4559208.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.7152482271194458, "sampling/importance_sampling_ratio/mean": 1.0001599788665771, "sampling/importance_sampling_ratio/min": 0.6121216416358948, "sampling/sampling_logp_difference/max": 0.5395578145980835, "sampling/sampling_logp_difference/mean": 0.015116818249225616, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3684394359588623, "epoch": 0.18137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.020667760502508, "kl": 0.004764545243233442, "learning_rate": 5.97560975609756e-07, "loss": -0.0077, "num_tokens": 4590242.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.537596344947815, "sampling/importance_sampling_ratio/mean": 0.9995277523994446, "sampling/importance_sampling_ratio/min": 0.6257607340812683, "sampling/sampling_logp_difference/max": 0.46878719329833984, "sampling/sampling_logp_difference/mean": 0.016859525814652443, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 188.484375, "completions/mean_terminated_length": 188.484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.292277991771698, "epoch": 0.18259803921568626, "frac_reward_zero_std": 1.0, "grad_norm": 0.03224063838537423, "kl": 0.00572782289236784, "learning_rate": 6.016260162601626e-07, "loss": 0.0001, "num_tokens": 4622033.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3439899682998657, "sampling/importance_sampling_ratio/mean": 0.9993945360183716, "sampling/importance_sampling_ratio/min": 0.6064043045043945, "sampling/sampling_logp_difference/max": 0.5002083778381348, "sampling/sampling_logp_difference/mean": 0.015040220692753792, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 156.171875, "completions/mean_terminated_length": 156.171875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.27697500586509705, "epoch": 0.18382352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 1.1502898286460446, "kl": 0.0055986312218010426, "learning_rate": 6.056910569105691e-07, "loss": -0.0426, "num_tokens": 4646828.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00044584274292, "sampling/importance_sampling_ratio/min": 0.6170914769172668, "sampling/sampling_logp_difference/max": 0.7287707328796387, "sampling/sampling_logp_difference/mean": 0.01559227705001831, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.42049098014831543, "epoch": 0.18504901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.362654332945133, "kl": 0.005915883928537369, "learning_rate": 6.097560975609756e-07, "loss": -0.0603, "num_tokens": 4682300.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5822070837020874, "sampling/importance_sampling_ratio/mean": 0.9996191263198853, "sampling/importance_sampling_ratio/min": 0.6361913084983826, "sampling/sampling_logp_difference/max": 0.45882081985473633, "sampling/sampling_logp_difference/mean": 0.016376720741391182, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 167.734375, "completions/mean_terminated_length": 167.734375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3233358561992645, "epoch": 0.18627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.029677670044690574, "kl": 0.0062215314246714115, "learning_rate": 6.13821138211382e-07, "loss": 0.0001, "num_tokens": 4713531.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5205823183059692, "sampling/importance_sampling_ratio/mean": 0.9996426701545715, "sampling/importance_sampling_ratio/min": 0.6058167219161987, "sampling/sampling_logp_difference/max": 0.5011777877807617, "sampling/sampling_logp_difference/mean": 0.01610369235277176, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 181.609375, "completions/mean_terminated_length": 181.609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.26142436265945435, "epoch": 0.1875, "frac_reward_zero_std": 1.0, "grad_norm": 0.021068254107589144, "kl": 0.004076074808835983, "learning_rate": 6.178861788617887e-07, "loss": 0.0, "num_tokens": 4741794.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4891793727874756, "sampling/importance_sampling_ratio/mean": 0.9995283484458923, "sampling/importance_sampling_ratio/min": 0.6404410004615784, "sampling/sampling_logp_difference/max": 0.4455982446670532, "sampling/sampling_logp_difference/mean": 0.012542849406599998, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 240.890625, "completions/mean_terminated_length": 240.890625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.32881855964660645, "epoch": 0.18872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01738556994587932, "kl": 0.003266222309321165, "learning_rate": 6.219512195121951e-07, "loss": 0.0, "num_tokens": 4776843.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5459238290786743, "sampling/importance_sampling_ratio/mean": 0.9998267889022827, "sampling/importance_sampling_ratio/min": 0.1480797976255417, "sampling/sampling_logp_difference/max": 1.9100040197372437, "sampling/sampling_logp_difference/mean": 0.014897173270583153, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 178.6875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3043786287307739, "epoch": 0.18995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.21318191034155, "kl": 0.0040686968713998795, "learning_rate": 6.260162601626016e-07, "loss": -0.0486, "num_tokens": 4804375.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6132657527923584, "sampling/importance_sampling_ratio/mean": 1.000281572341919, "sampling/importance_sampling_ratio/min": 0.6176195740699768, "sampling/sampling_logp_difference/max": 0.48188257217407227, "sampling/sampling_logp_difference/mean": 0.014598002657294273, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28690484166145325, "epoch": 0.19117647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.02361950208598998, "kl": 0.004340208135545254, "learning_rate": 6.300813008130081e-07, "loss": 0.0, "num_tokens": 4834823.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4644800424575806, "sampling/importance_sampling_ratio/mean": 0.9994724988937378, "sampling/importance_sampling_ratio/min": 0.6098414659500122, "sampling/sampling_logp_difference/max": 0.4945563077926636, "sampling/sampling_logp_difference/mean": 0.014058542437851429, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 176.578125, "completions/mean_terminated_length": 176.578125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3075827956199646, "epoch": 0.19240196078431374, "frac_reward_zero_std": 1.0, "grad_norm": 0.016886477020967572, "kl": 0.003711746772751212, "learning_rate": 6.341463414634146e-07, "loss": 0.0, "num_tokens": 4866460.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5459957122802734, "sampling/importance_sampling_ratio/mean": 1.000352144241333, "sampling/importance_sampling_ratio/min": 0.6635557413101196, "sampling/sampling_logp_difference/max": 0.4356682300567627, "sampling/sampling_logp_difference/mean": 0.014879366382956505, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 160.734375, "completions/mean_terminated_length": 160.734375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.29727303981781006, "epoch": 0.19362745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.018378535930924606, "kl": 0.00456765852868557, "learning_rate": 6.382113821138211e-07, "loss": 0.0, "num_tokens": 4891451.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9994612336158752, "sampling/importance_sampling_ratio/min": 0.5615734457969666, "sampling/sampling_logp_difference/max": 0.5770127773284912, "sampling/sampling_logp_difference/mean": 0.014178035780787468, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 208.546875, "completions/mean_terminated_length": 208.546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.39588308334350586, "epoch": 0.1948529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.016381547211185944, "kl": 0.00348066003061831, "learning_rate": 6.422764227642276e-07, "loss": 0.0, "num_tokens": 4920494.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3604146242141724, "sampling/importance_sampling_ratio/mean": 1.0002737045288086, "sampling/importance_sampling_ratio/min": 0.6240179538726807, "sampling/sampling_logp_difference/max": 0.4715762138366699, "sampling/sampling_logp_difference/mean": 0.017580877989530563, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 196.703125, "completions/mean_terminated_length": 196.703125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.41126543283462524, "epoch": 0.19607843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9591859348671818, "kl": 0.0034652426838874817, "learning_rate": 6.463414634146342e-07, "loss": -0.0195, "num_tokens": 4952715.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4795540571212769, "sampling/importance_sampling_ratio/mean": 1.0006828308105469, "sampling/importance_sampling_ratio/min": 0.6171398758888245, "sampling/sampling_logp_difference/max": 0.48265957832336426, "sampling/sampling_logp_difference/mean": 0.017289428040385246, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 197.109375, "completions/mean_terminated_length": 197.109375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.34364861249923706, "epoch": 0.19730392156862744, "frac_reward_zero_std": 1.0, "grad_norm": 0.01722778244946955, "kl": 0.004390948452055454, "learning_rate": 6.504065040650406e-07, "loss": 0.0, "num_tokens": 4982434.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999923825263977, "sampling/importance_sampling_ratio/min": 0.6132733821868896, "sampling/sampling_logp_difference/max": 0.9893083572387695, "sampling/sampling_logp_difference/mean": 0.01679658517241478, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 218.671875, "completions/mean_terminated_length": 218.671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3521959185600281, "epoch": 0.19852941176470587, "frac_reward_zero_std": 0.75, "grad_norm": 0.8587819363528818, "kl": 0.0034514947328716516, "learning_rate": 6.544715447154471e-07, "loss": -0.0562, "num_tokens": 5015149.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.433842658996582, "sampling/importance_sampling_ratio/mean": 0.9994710683822632, "sampling/importance_sampling_ratio/min": 0.49541690945625305, "sampling/sampling_logp_difference/max": 0.7023556232452393, "sampling/sampling_logp_difference/mean": 0.01686188578605652, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2904425263404846, "epoch": 0.19975490196078433, "frac_reward_zero_std": 1.0, "grad_norm": 0.012973321770188678, "kl": 0.003004253376275301, "learning_rate": 6.585365853658536e-07, "loss": 0.0, "num_tokens": 5047293.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4284110069274902, "sampling/importance_sampling_ratio/mean": 1.0002599954605103, "sampling/importance_sampling_ratio/min": 0.6202362179756165, "sampling/sampling_logp_difference/max": 0.47765493392944336, "sampling/sampling_logp_difference/mean": 0.013213622383773327, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 224.828125, "completions/mean_terminated_length": 224.828125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2878662049770355, "epoch": 0.20098039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.012719596988869973, "kl": 0.0029752333648502827, "learning_rate": 6.626016260162602e-07, "loss": 0.0, "num_tokens": 5094402.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4756325483322144, "sampling/importance_sampling_ratio/mean": 1.0003304481506348, "sampling/importance_sampling_ratio/min": 0.5676484704017639, "sampling/sampling_logp_difference/max": 0.5662529468536377, "sampling/sampling_logp_difference/mean": 0.013716815039515495, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 217.4375, "completions/mean_terminated_length": 217.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.37395769357681274, "epoch": 0.20220588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.8767704887704821, "kl": 0.0044336821883916855, "learning_rate": 6.666666666666666e-07, "loss": 0.0524, "num_tokens": 5126030.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.525197148323059, "sampling/importance_sampling_ratio/mean": 1.0002803802490234, "sampling/importance_sampling_ratio/min": 0.6158265471458435, "sampling/sampling_logp_difference/max": 0.4847898483276367, "sampling/sampling_logp_difference/mean": 0.015268450602889061, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 168.71875, "completions/mean_terminated_length": 168.71875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3771747946739197, "epoch": 0.2034313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01791208693210655, "kl": 0.004233954939991236, "learning_rate": 6.707317073170731e-07, "loss": 0.0, "num_tokens": 5152300.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3672889471054077, "sampling/importance_sampling_ratio/mean": 0.9996294975280762, "sampling/importance_sampling_ratio/min": 0.6267502307891846, "sampling/sampling_logp_difference/max": 0.4672071933746338, "sampling/sampling_logp_difference/mean": 0.016697824001312256, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 203.796875, "completions/mean_terminated_length": 203.796875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3788134455680847, "epoch": 0.20465686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0157473432980807, "kl": 0.004376332275569439, "learning_rate": 6.747967479674797e-07, "loss": 0.0, "num_tokens": 5182671.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.585753321647644, "sampling/importance_sampling_ratio/mean": 1.0001531839370728, "sampling/importance_sampling_ratio/min": 0.6228065490722656, "sampling/sampling_logp_difference/max": 0.47351932525634766, "sampling/sampling_logp_difference/mean": 0.015365363098680973, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 217.5625, "completions/mean_terminated_length": 217.5625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.46515071392059326, "epoch": 0.20588235294117646, "frac_reward_zero_std": 0.5, "grad_norm": 1.2459167809280487, "kl": 0.003541711252182722, "learning_rate": 6.788617886178861e-07, "loss": 0.0037, "num_tokens": 5214211.0, "reward": -0.03125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.435779333114624, "sampling/importance_sampling_ratio/mean": 1.0002970695495605, "sampling/importance_sampling_ratio/min": 0.3252015709877014, "sampling/sampling_logp_difference/max": 1.1233100891113281, "sampling/sampling_logp_difference/mean": 0.017395062372088432, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 244.296875, "completions/mean_terminated_length": 244.296875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.35924825072288513, "epoch": 0.20710784313725492, "frac_reward_zero_std": 1.0, "grad_norm": 0.012788952296011735, "kl": 0.003738091792911291, "learning_rate": 6.829268292682927e-07, "loss": 0.0, "num_tokens": 5247734.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5637272596359253, "sampling/importance_sampling_ratio/mean": 1.0001357793807983, "sampling/importance_sampling_ratio/min": 0.5928646326065063, "sampling/sampling_logp_difference/max": 0.5227892398834229, "sampling/sampling_logp_difference/mean": 0.015229095704853535, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 219.46875, "completions/mean_terminated_length": 219.46875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3450343608856201, "epoch": 0.20833333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.9312880034856267, "kl": 0.0034308377653360367, "learning_rate": 6.869918699186991e-07, "loss": -0.0048, "num_tokens": 5282340.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4391900300979614, "sampling/importance_sampling_ratio/mean": 0.999631404876709, "sampling/importance_sampling_ratio/min": 0.546565592288971, "sampling/sampling_logp_difference/max": 0.6041009426116943, "sampling/sampling_logp_difference/mean": 0.016199318692088127, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 177.84375, "completions/mean_terminated_length": 177.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.386122465133667, "epoch": 0.20955882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.017727940835066208, "kl": 0.003923412412405014, "learning_rate": 6.910569105691057e-07, "loss": 0.0, "num_tokens": 5311642.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4683748483657837, "sampling/importance_sampling_ratio/mean": 0.9991652369499207, "sampling/importance_sampling_ratio/min": 0.6329178810119629, "sampling/sampling_logp_difference/max": 0.4574146270751953, "sampling/sampling_logp_difference/mean": 0.01795331947505474, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 234.734375, "completions/mean_terminated_length": 234.734375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.33944404125213623, "epoch": 0.2107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.7468689868955847, "kl": 0.0033471607603132725, "learning_rate": 6.951219512195121e-07, "loss": -0.0085, "num_tokens": 5348041.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.598752737045288, "sampling/importance_sampling_ratio/mean": 0.9998255372047424, "sampling/importance_sampling_ratio/min": 0.637968897819519, "sampling/sampling_logp_difference/max": 0.4692237377166748, "sampling/sampling_logp_difference/mean": 0.014106452465057373, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 218.921875, "completions/mean_terminated_length": 218.921875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.30958008766174316, "epoch": 0.21200980392156862, "frac_reward_zero_std": 1.0, "grad_norm": 0.012583170300759833, "kl": 0.003378768917173147, "learning_rate": 6.991869918699187e-07, "loss": 0.0, "num_tokens": 5380228.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5745068788528442, "sampling/importance_sampling_ratio/mean": 1.0001413822174072, "sampling/importance_sampling_ratio/min": 0.6016868948936462, "sampling/sampling_logp_difference/max": 0.5080180168151855, "sampling/sampling_logp_difference/mean": 0.014329886995255947, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 158.9375, "completions/mean_terminated_length": 158.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4220223128795624, "epoch": 0.21323529411764705, "frac_reward_zero_std": 0.75, "grad_norm": 0.9888867576948446, "kl": 0.004015155136585236, "learning_rate": 7.032520325203252e-07, "loss": -0.0017, "num_tokens": 5409168.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6091922521591187, "sampling/importance_sampling_ratio/mean": 0.9998781681060791, "sampling/importance_sampling_ratio/min": 0.6208694577217102, "sampling/sampling_logp_difference/max": 0.47663450241088867, "sampling/sampling_logp_difference/mean": 0.019033968448638916, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 239.890625, "completions/mean_terminated_length": 239.890625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.32471412420272827, "epoch": 0.21446078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.010540768455384751, "kl": 0.002627901965752244, "learning_rate": 7.073170731707316e-07, "loss": 0.0, "num_tokens": 5445769.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5332598686218262, "sampling/importance_sampling_ratio/mean": 1.000173568725586, "sampling/importance_sampling_ratio/min": 0.675683856010437, "sampling/sampling_logp_difference/max": 0.4273960590362549, "sampling/sampling_logp_difference/mean": 0.014171200804412365, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 196.9375, "completions/mean_terminated_length": 196.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4236835539340973, "epoch": 0.21568627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.013264606020078321, "kl": 0.00399052444845438, "learning_rate": 7.113821138211382e-07, "loss": 0.0, "num_tokens": 5480949.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4288609027862549, "sampling/importance_sampling_ratio/mean": 0.9993376731872559, "sampling/importance_sampling_ratio/min": 0.6968380808830261, "sampling/sampling_logp_difference/max": 0.3612022399902344, "sampling/sampling_logp_difference/mean": 0.01731596142053604, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3174845576286316, "epoch": 0.21691176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.012555111751764091, "kl": 0.0028034732677042484, "learning_rate": 7.154471544715447e-07, "loss": 0.0, "num_tokens": 5517717.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6263964176177979, "sampling/importance_sampling_ratio/mean": 1.0002188682556152, "sampling/importance_sampling_ratio/min": 0.6396260857582092, "sampling/sampling_logp_difference/max": 0.48636674880981445, "sampling/sampling_logp_difference/mean": 0.014294363558292389, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 204.046875, "completions/mean_terminated_length": 204.046875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2718321681022644, "epoch": 0.2181372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.011483939277446507, "kl": 0.00246023153886199, "learning_rate": 7.195121951219512e-07, "loss": 0.0, "num_tokens": 5551096.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3715288639068604, "sampling/importance_sampling_ratio/mean": 0.9999605417251587, "sampling/importance_sampling_ratio/min": 0.6878846883773804, "sampling/sampling_logp_difference/max": 0.3741340637207031, "sampling/sampling_logp_difference/mean": 0.013501793146133423, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 195.578125, "completions/mean_terminated_length": 195.578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3303038775920868, "epoch": 0.2193627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 1.0069651822560624, "kl": 0.0032733306288719177, "learning_rate": 7.235772357723577e-07, "loss": 0.0362, "num_tokens": 5588973.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4814108610153198, "sampling/importance_sampling_ratio/mean": 0.9997945427894592, "sampling/importance_sampling_ratio/min": 0.4739038646221161, "sampling/sampling_logp_difference/max": 0.7467508316040039, "sampling/sampling_logp_difference/mean": 0.014692796394228935, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.41069531440734863, "epoch": 0.22058823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 1.3343516607581438, "kl": 0.0040096924640238285, "learning_rate": 7.276422764227642e-07, "loss": -0.0364, "num_tokens": 5619933.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4864038228988647, "sampling/importance_sampling_ratio/mean": 0.9999009370803833, "sampling/importance_sampling_ratio/min": 0.6505169868469238, "sampling/sampling_logp_difference/max": 0.42998790740966797, "sampling/sampling_logp_difference/mean": 0.018758460879325867, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3594907224178314, "epoch": 0.22181372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 1.310196378375213, "kl": 0.00374414399266243, "learning_rate": 7.317073170731707e-07, "loss": -0.0079, "num_tokens": 5648913.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5919442176818848, "sampling/importance_sampling_ratio/mean": 1.0004675388336182, "sampling/importance_sampling_ratio/min": 0.6224140524864197, "sampling/sampling_logp_difference/max": 0.4741497039794922, "sampling/sampling_logp_difference/mean": 0.01597871072590351, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 193.1875, "completions/mean_terminated_length": 193.1875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3262665271759033, "epoch": 0.22303921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.017245806061220382, "kl": 0.0042599341832101345, "learning_rate": 7.357723577235772e-07, "loss": 0.0, "num_tokens": 5683837.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.506589412689209, "sampling/importance_sampling_ratio/mean": 0.9992817640304565, "sampling/importance_sampling_ratio/min": 0.6075704097747803, "sampling/sampling_logp_difference/max": 0.4982872009277344, "sampling/sampling_logp_difference/mean": 0.01512528583407402, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 181.078125, "completions/mean_terminated_length": 181.078125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.32310307025909424, "epoch": 0.22426470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.012105599319543987, "kl": 0.003585977014154196, "learning_rate": 7.398373983739837e-07, "loss": 0.0, "num_tokens": 5715874.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4722709655761719, "sampling/importance_sampling_ratio/mean": 1.0006076097488403, "sampling/importance_sampling_ratio/min": 0.6348845958709717, "sampling/sampling_logp_difference/max": 0.4543120861053467, "sampling/sampling_logp_difference/mean": 0.01581915281713009, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 193.015625, "completions/mean_terminated_length": 193.015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.40230071544647217, "epoch": 0.22549019607843138, "frac_reward_zero_std": 0.75, "grad_norm": 1.1221218071458032, "kl": 0.004614609759300947, "learning_rate": 7.439024390243903e-07, "loss": 0.0372, "num_tokens": 5743107.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9996156692504883, "sampling/importance_sampling_ratio/min": 0.6805951595306396, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.01691802218556404, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 181.578125, "completions/mean_terminated_length": 181.578125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.46721404790878296, "epoch": 0.2267156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.4089179931957916, "kl": 0.006852032616734505, "learning_rate": 7.479674796747967e-07, "loss": 0.0262, "num_tokens": 5772184.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6012070178985596, "sampling/importance_sampling_ratio/mean": 0.999984860420227, "sampling/importance_sampling_ratio/min": 0.6403137445449829, "sampling/sampling_logp_difference/max": 0.47075772285461426, "sampling/sampling_logp_difference/mean": 0.01858537644147873, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 180.03125, "completions/mean_terminated_length": 180.03125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3536166846752167, "epoch": 0.22794117647058823, "frac_reward_zero_std": 1.0, "grad_norm": 0.013904942323311334, "kl": 0.004634760785847902, "learning_rate": 7.520325203252032e-07, "loss": 0.0, "num_tokens": 5804314.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5369963645935059, "sampling/importance_sampling_ratio/mean": 1.0005981922149658, "sampling/importance_sampling_ratio/min": 0.6708944439888, "sampling/sampling_logp_difference/max": 0.42983007431030273, "sampling/sampling_logp_difference/mean": 0.015495835803449154, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 213.890625, "completions/mean_terminated_length": 213.890625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.44987189769744873, "epoch": 0.22916666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.9168057676492665, "kl": 0.005267436150461435, "learning_rate": 7.560975609756097e-07, "loss": 0.0137, "num_tokens": 5836579.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3781383037567139, "sampling/importance_sampling_ratio/mean": 0.9997191429138184, "sampling/importance_sampling_ratio/min": 0.5998452305793762, "sampling/sampling_logp_difference/max": 0.5110836029052734, "sampling/sampling_logp_difference/mean": 0.017986461520195007, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 218.234375, "completions/mean_terminated_length": 218.234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4195953607559204, "epoch": 0.23039215686274508, "frac_reward_zero_std": 0.75, "grad_norm": 1.0181818507958051, "kl": 0.004643009044229984, "learning_rate": 7.601626016260162e-07, "loss": -0.0348, "num_tokens": 5877618.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002375841140747, "sampling/importance_sampling_ratio/min": 0.6030285954475403, "sampling/sampling_logp_difference/max": 1.470839500427246, "sampling/sampling_logp_difference/mean": 0.017538927495479584, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 186.34375, "completions/mean_terminated_length": 186.34375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.38808345794677734, "epoch": 0.23161764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01810426589484519, "kl": 0.005774513818323612, "learning_rate": 7.642276422764228e-07, "loss": 0.0001, "num_tokens": 5905400.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4132622480392456, "sampling/importance_sampling_ratio/mean": 0.9998592734336853, "sampling/importance_sampling_ratio/min": 0.6387045979499817, "sampling/sampling_logp_difference/max": 0.44831323623657227, "sampling/sampling_logp_difference/mean": 0.016829874366521835, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 129.3125, "completions/mean_terminated_length": 129.3125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3367841839790344, "epoch": 0.23284313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.4615088240305407, "kl": 0.009488044306635857, "learning_rate": 7.682926829268292e-07, "loss": -0.0405, "num_tokens": 5928972.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4128563404083252, "sampling/importance_sampling_ratio/mean": 0.9995773434638977, "sampling/importance_sampling_ratio/min": 0.5278366208076477, "sampling/sampling_logp_difference/max": 0.6389684677124023, "sampling/sampling_logp_difference/mean": 0.017004843801259995, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 212.15625, "completions/mean_terminated_length": 212.15625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.40614062547683716, "epoch": 0.2340686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.019031274405640942, "kl": 0.00624141376465559, "learning_rate": 7.723577235772358e-07, "loss": 0.0001, "num_tokens": 5964614.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7942405939102173, "sampling/importance_sampling_ratio/mean": 0.9996316432952881, "sampling/importance_sampling_ratio/min": 0.6298720836639404, "sampling/sampling_logp_difference/max": 0.5845818519592285, "sampling/sampling_logp_difference/mean": 0.017022619023919106, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 224.171875, "completions/mean_terminated_length": 224.171875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.37709319591522217, "epoch": 0.23529411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 0.8466587922316804, "kl": 0.006414324976503849, "learning_rate": 7.764227642276422e-07, "loss": -0.0201, "num_tokens": 5994145.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5532957315444946, "sampling/importance_sampling_ratio/mean": 0.9999184012413025, "sampling/importance_sampling_ratio/min": 0.6281425356864929, "sampling/sampling_logp_difference/max": 0.46498823165893555, "sampling/sampling_logp_difference/mean": 0.015568745322525501, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 289.28125, "completions/mean_terminated_length": 289.28125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3816249668598175, "epoch": 0.23651960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 0.7064395936203522, "kl": 0.0034015390556305647, "learning_rate": 7.804878048780488e-07, "loss": 0.0145, "num_tokens": 6037539.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.498374342918396, "sampling/importance_sampling_ratio/mean": 0.9999957084655762, "sampling/importance_sampling_ratio/min": 0.6288349032402039, "sampling/sampling_logp_difference/max": 0.4638864994049072, "sampling/sampling_logp_difference/mean": 0.01520681381225586, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 241.546875, "completions/mean_terminated_length": 241.546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3961125612258911, "epoch": 0.23774509803921567, "frac_reward_zero_std": 0.75, "grad_norm": 0.782659026683507, "kl": 0.006137443706393242, "learning_rate": 7.845528455284552e-07, "loss": -0.0032, "num_tokens": 6069014.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.484194040298462, "sampling/importance_sampling_ratio/mean": 1.000030517578125, "sampling/importance_sampling_ratio/min": 0.5979025363922119, "sampling/sampling_logp_difference/max": 0.5143275260925293, "sampling/sampling_logp_difference/mean": 0.01631583273410797, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 251.78125, "completions/mean_terminated_length": 251.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37602466344833374, "epoch": 0.23897058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.014806989123684491, "kl": 0.0050431289710104465, "learning_rate": 7.886178861788617e-07, "loss": 0.0, "num_tokens": 6104072.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7715590000152588, "sampling/importance_sampling_ratio/mean": 1.0002853870391846, "sampling/importance_sampling_ratio/min": 0.6202758550643921, "sampling/sampling_logp_difference/max": 0.5718599557876587, "sampling/sampling_logp_difference/mean": 0.015946928411722183, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 181.453125, "completions/mean_terminated_length": 181.453125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.31748703122138977, "epoch": 0.24019607843137256, "frac_reward_zero_std": 1.0, "grad_norm": 0.022062487150172987, "kl": 0.006586791016161442, "learning_rate": 7.926829268292683e-07, "loss": 0.0001, "num_tokens": 6132069.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5275835990905762, "sampling/importance_sampling_ratio/mean": 0.9999291300773621, "sampling/importance_sampling_ratio/min": 0.6065285205841064, "sampling/sampling_logp_difference/max": 0.5000035762786865, "sampling/sampling_logp_difference/mean": 0.015199665911495686, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 198.5625, "completions/mean_terminated_length": 198.5625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3671415448188782, "epoch": 0.24142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.01974363161673683, "kl": 0.006311338860541582, "learning_rate": 7.967479674796747e-07, "loss": 0.0001, "num_tokens": 6164473.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.367321252822876, "sampling/importance_sampling_ratio/mean": 1.000627040863037, "sampling/importance_sampling_ratio/min": 0.6262636184692383, "sampling/sampling_logp_difference/max": 0.4679839611053467, "sampling/sampling_logp_difference/mean": 0.014926253817975521, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 184.546875, "completions/mean_terminated_length": 184.546875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.35503798723220825, "epoch": 0.2426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9855354953030266, "kl": 0.0070960987359285355, "learning_rate": 8.008130081300813e-07, "loss": 0.0046, "num_tokens": 6189756.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.8739265203475952, "sampling/importance_sampling_ratio/mean": 1.000748634338379, "sampling/importance_sampling_ratio/min": 0.6033843755722046, "sampling/sampling_logp_difference/max": 0.6280360221862793, "sampling/sampling_logp_difference/mean": 0.01589926704764366, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 214.265625, "completions/mean_terminated_length": 214.265625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.37239372730255127, "epoch": 0.24387254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.0217863839072463, "kl": 0.006487885490059853, "learning_rate": 8.048780487804878e-07, "loss": -0.0282, "num_tokens": 6222333.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.320409893989563, "sampling/importance_sampling_ratio/mean": 0.9995848536491394, "sampling/importance_sampling_ratio/min": 0.654735803604126, "sampling/sampling_logp_difference/max": 0.4235234260559082, "sampling/sampling_logp_difference/mean": 0.014400139451026917, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 226.921875, "completions/mean_terminated_length": 226.921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.34588587284088135, "epoch": 0.24509803921568626, "frac_reward_zero_std": 1.0, "grad_norm": 0.015066589558551828, "kl": 0.00500678364187479, "learning_rate": 8.089430894308943e-07, "loss": 0.0, "num_tokens": 6262344.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9079806804656982, "sampling/importance_sampling_ratio/mean": 1.000096321105957, "sampling/importance_sampling_ratio/min": 0.4871023893356323, "sampling/sampling_logp_difference/max": 0.7192809581756592, "sampling/sampling_logp_difference/mean": 0.014446980319917202, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 200.984375, "completions/mean_terminated_length": 200.984375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4419822096824646, "epoch": 0.24632352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.017528080005748545, "kl": 0.005993842612951994, "learning_rate": 8.130081300813008e-07, "loss": 0.0001, "num_tokens": 6292359.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4518470764160156, "sampling/importance_sampling_ratio/mean": 1.000361442565918, "sampling/importance_sampling_ratio/min": 0.5999310612678528, "sampling/sampling_logp_difference/max": 0.5109405517578125, "sampling/sampling_logp_difference/mean": 0.017505858093500137, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3625558614730835, "epoch": 0.24754901960784315, "frac_reward_zero_std": 0.25, "grad_norm": 1.4113681466450303, "kl": 0.0048790015280246735, "learning_rate": 8.170731707317072e-07, "loss": -0.0063, "num_tokens": 6326223.0, "reward": 0.71875, "reward_std": 0.5722135901451111, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.326668381690979, "sampling/importance_sampling_ratio/mean": 1.000248670578003, "sampling/importance_sampling_ratio/min": 0.60129714012146, "sampling/sampling_logp_difference/max": 0.5086660385131836, "sampling/sampling_logp_difference/mean": 0.014618618413805962, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 132.265625, "completions/mean_terminated_length": 132.265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.33129891753196716, "epoch": 0.24877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02401984277029526, "kl": 0.0077463616617023945, "learning_rate": 8.211382113821138e-07, "loss": 0.0001, "num_tokens": 6349680.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6219075918197632, "sampling/importance_sampling_ratio/mean": 0.999733567237854, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.48360300064086914, "sampling/sampling_logp_difference/mean": 0.015228422358632088, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 161.734375, "completions/mean_terminated_length": 161.734375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3452780246734619, "epoch": 0.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.018051280744605103, "kl": 0.005515796132385731, "learning_rate": 8.252032520325202e-07, "loss": 0.0001, "num_tokens": 6381487.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4702109098434448, "sampling/importance_sampling_ratio/mean": 1.0000395774841309, "sampling/importance_sampling_ratio/min": 0.5659708380699158, "sampling/sampling_logp_difference/max": 0.5692126750946045, "sampling/sampling_logp_difference/mean": 0.01679180935025215, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 205.28125, "completions/mean_terminated_length": 205.28125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3766824007034302, "epoch": 0.2512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.7310462494103193, "kl": 0.006803940050303936, "learning_rate": 8.292682926829268e-07, "loss": 0.0159, "num_tokens": 6410017.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6630347967147827, "sampling/importance_sampling_ratio/mean": 1.000176191329956, "sampling/importance_sampling_ratio/min": 0.6379572153091431, "sampling/sampling_logp_difference/max": 0.5086441040039062, "sampling/sampling_logp_difference/mean": 0.014964250847697258, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3047809898853302, "epoch": 0.25245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.018410139615306992, "kl": 0.006978918798267841, "learning_rate": 8.333333333333333e-07, "loss": 0.0001, "num_tokens": 6441881.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.57449209690094, "sampling/importance_sampling_ratio/mean": 0.9996367692947388, "sampling/importance_sampling_ratio/min": 0.6156854033470154, "sampling/sampling_logp_difference/max": 0.4850192070007324, "sampling/sampling_logp_difference/mean": 0.014237132854759693, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 244.40625, "completions/mean_terminated_length": 244.40625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4763853847980499, "epoch": 0.2536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.742727042758716, "kl": 0.0060688345693051815, "learning_rate": 8.373983739837398e-07, "loss": -0.0075, "num_tokens": 6475699.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6098331212997437, "sampling/importance_sampling_ratio/mean": 1.000468373298645, "sampling/importance_sampling_ratio/min": 0.6110594272613525, "sampling/sampling_logp_difference/max": 0.49256110191345215, "sampling/sampling_logp_difference/mean": 0.017908930778503418, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4074808657169342, "epoch": 0.2549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9959708854852373, "kl": 0.00976475328207016, "learning_rate": 8.414634146341463e-07, "loss": -0.0133, "num_tokens": 6502783.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5459479093551636, "sampling/importance_sampling_ratio/mean": 1.0006380081176758, "sampling/importance_sampling_ratio/min": 0.62617027759552, "sampling/sampling_logp_difference/max": 0.46813297271728516, "sampling/sampling_logp_difference/mean": 0.016537927091121674, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 157.953125, "completions/mean_terminated_length": 157.953125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4467363953590393, "epoch": 0.25612745098039214, "frac_reward_zero_std": 0.75, "grad_norm": 1.1726912773808704, "kl": 0.009501153603196144, "learning_rate": 8.455284552845529e-07, "loss": 0.0232, "num_tokens": 6527932.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4439783096313477, "sampling/importance_sampling_ratio/mean": 1.0001927614212036, "sampling/importance_sampling_ratio/min": 0.6232355237007141, "sampling/sampling_logp_difference/max": 0.47283077239990234, "sampling/sampling_logp_difference/mean": 0.019368894398212433, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 179.359375, "completions/mean_terminated_length": 179.359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3574153482913971, "epoch": 0.25735294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.1452082427598702, "kl": 0.007702191825956106, "learning_rate": 8.495934959349593e-07, "loss": -0.0059, "num_tokens": 6557443.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.549471139907837, "sampling/importance_sampling_ratio/mean": 1.0003321170806885, "sampling/importance_sampling_ratio/min": 0.7380995154380798, "sampling/sampling_logp_difference/max": 0.4379136562347412, "sampling/sampling_logp_difference/mean": 0.014593801461160183, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 195.640625, "completions/mean_terminated_length": 195.640625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4797608256340027, "epoch": 0.25857843137254904, "frac_reward_zero_std": 0.75, "grad_norm": 0.786825032092835, "kl": 0.009710676968097687, "learning_rate": 8.536585365853657e-07, "loss": -0.0074, "num_tokens": 6587948.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9994493722915649, "sampling/importance_sampling_ratio/min": 0.6035121083259583, "sampling/sampling_logp_difference/max": 0.5049891471862793, "sampling/sampling_logp_difference/mean": 0.018817134201526642, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4090731739997864, "epoch": 0.25980392156862747, "frac_reward_zero_std": 1.0, "grad_norm": 0.02247531717029179, "kl": 0.008128427900373936, "learning_rate": 8.577235772357723e-07, "loss": 0.0001, "num_tokens": 6613948.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4769021272659302, "sampling/importance_sampling_ratio/mean": 0.9996882677078247, "sampling/importance_sampling_ratio/min": 0.6154875159263611, "sampling/sampling_logp_difference/max": 0.48534059524536133, "sampling/sampling_logp_difference/mean": 0.01656338945031166, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4429599642753601, "epoch": 0.2610294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.875112258366726, "kl": 0.007630010601133108, "learning_rate": 8.617886178861788e-07, "loss": -0.0137, "num_tokens": 6650524.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.7480331659317017, "sampling/importance_sampling_ratio/mean": 1.0001615285873413, "sampling/importance_sampling_ratio/min": 0.6175042986869812, "sampling/sampling_logp_difference/max": 0.5584912300109863, "sampling/sampling_logp_difference/mean": 0.01645074039697647, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 244.96875, "completions/mean_terminated_length": 244.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.45601630210876465, "epoch": 0.2622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01924254280240346, "kl": 0.00718282163143158, "learning_rate": 8.658536585365853e-07, "loss": 0.0001, "num_tokens": 6690858.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.699453592300415, "sampling/importance_sampling_ratio/mean": 0.9991973638534546, "sampling/importance_sampling_ratio/min": 0.6268161535263062, "sampling/sampling_logp_difference/max": 0.5303068161010742, "sampling/sampling_logp_difference/mean": 0.016671188175678253, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 182.21875, "completions/mean_terminated_length": 182.21875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.41794514656066895, "epoch": 0.26348039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.025044838988155363, "kl": 0.009658871218562126, "learning_rate": 8.699186991869918e-07, "loss": 0.0001, "num_tokens": 6730216.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.873031497001648, "sampling/importance_sampling_ratio/mean": 0.9994128942489624, "sampling/importance_sampling_ratio/min": 0.6774318814277649, "sampling/sampling_logp_difference/max": 0.6275582313537598, "sampling/sampling_logp_difference/mean": 0.01576501503586769, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3788728713989258, "epoch": 0.2647058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.021093975561889924, "kl": 0.009017504751682281, "learning_rate": 8.739837398373984e-07, "loss": 0.0001, "num_tokens": 6764600.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4169543981552124, "sampling/importance_sampling_ratio/mean": 1.000279188156128, "sampling/importance_sampling_ratio/min": 0.5626490712165833, "sampling/sampling_logp_difference/max": 0.5750991106033325, "sampling/sampling_logp_difference/mean": 0.014173740521073341, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 204.015625, "completions/mean_terminated_length": 204.015625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.389233261346817, "epoch": 0.2659313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.024991982444765946, "kl": 0.011074772104620934, "learning_rate": 8.780487804878048e-07, "loss": 0.0001, "num_tokens": 6796729.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.401310920715332, "sampling/importance_sampling_ratio/mean": 1.0005078315734863, "sampling/importance_sampling_ratio/min": 0.677149772644043, "sampling/sampling_logp_difference/max": 0.3898627758026123, "sampling/sampling_logp_difference/mean": 0.014612487517297268, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 219.15625, "completions/mean_terminated_length": 219.15625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3317946195602417, "epoch": 0.26715686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.021233170321532966, "kl": 0.008910744450986385, "learning_rate": 8.821138211382113e-07, "loss": 0.0001, "num_tokens": 6831971.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.327485203742981, "sampling/importance_sampling_ratio/mean": 0.9994485378265381, "sampling/importance_sampling_ratio/min": 0.6925040483474731, "sampling/sampling_logp_difference/max": 0.36744117736816406, "sampling/sampling_logp_difference/mean": 0.01264607347548008, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4074782133102417, "epoch": 0.26838235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.035860313956660055, "kl": 0.013461483642458916, "learning_rate": 8.861788617886179e-07, "loss": 0.0001, "num_tokens": 6861071.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4998352527618408, "sampling/importance_sampling_ratio/mean": 0.9996261596679688, "sampling/importance_sampling_ratio/min": 0.614910364151001, "sampling/sampling_logp_difference/max": 0.486278772354126, "sampling/sampling_logp_difference/mean": 0.01648368313908577, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 210.859375, "completions/mean_terminated_length": 210.859375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4149256944656372, "epoch": 0.2696078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.2233127015137235, "kl": 0.011519803665578365, "learning_rate": 8.902439024390244e-07, "loss": -0.0113, "num_tokens": 6892294.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4479440450668335, "sampling/importance_sampling_ratio/mean": 1.0004557371139526, "sampling/importance_sampling_ratio/min": 0.6886289715766907, "sampling/sampling_logp_difference/max": 0.37305259704589844, "sampling/sampling_logp_difference/mean": 0.01560588926076889, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 202.28125, "completions/mean_terminated_length": 202.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.5796515941619873, "epoch": 0.2708333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.9406076674499578, "kl": 0.012280515395104885, "learning_rate": 8.943089430894308e-07, "loss": 0.0045, "num_tokens": 6920904.0, "reward": -0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3890950679779053, "sampling/importance_sampling_ratio/mean": 0.9999018907546997, "sampling/importance_sampling_ratio/min": 0.6918495297431946, "sampling/sampling_logp_difference/max": 0.36838674545288086, "sampling/sampling_logp_difference/mean": 0.018281951546669006, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 177.34375, "completions/mean_terminated_length": 177.34375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.38370460271835327, "epoch": 0.27205882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.02551354122962791, "kl": 0.011651578359305859, "learning_rate": 8.983739837398373e-07, "loss": 0.0001, "num_tokens": 6952062.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6262208223342896, "sampling/importance_sampling_ratio/mean": 1.0000596046447754, "sampling/importance_sampling_ratio/min": 0.6893720030784607, "sampling/sampling_logp_difference/max": 0.48625874519348145, "sampling/sampling_logp_difference/mean": 0.014482217840850353, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 227.609375, "completions/mean_terminated_length": 227.609375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.40940508246421814, "epoch": 0.27328431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.02309105560794474, "kl": 0.00955723226070404, "learning_rate": 9.024390243902439e-07, "loss": 0.0001, "num_tokens": 6985877.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4982682466506958, "sampling/importance_sampling_ratio/mean": 0.999754011631012, "sampling/importance_sampling_ratio/min": 0.6371425986289978, "sampling/sampling_logp_difference/max": 0.4507617950439453, "sampling/sampling_logp_difference/mean": 0.0144026018679142, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 205.328125, "completions/mean_terminated_length": 205.328125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4640798568725586, "epoch": 0.27450980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.025321638175525355, "kl": 0.010854870080947876, "learning_rate": 9.065040650406503e-07, "loss": 0.0001, "num_tokens": 7018010.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5766310691833496, "sampling/importance_sampling_ratio/mean": 0.9997616410255432, "sampling/importance_sampling_ratio/min": 0.6246612668037415, "sampling/sampling_logp_difference/max": 0.47054576873779297, "sampling/sampling_logp_difference/mean": 0.01709725335240364, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 216.078125, "completions/mean_terminated_length": 216.078125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3811000883579254, "epoch": 0.2757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.025339046862078777, "kl": 0.011608104221522808, "learning_rate": 9.105691056910569e-07, "loss": 0.0001, "num_tokens": 7047103.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4439653158187866, "sampling/importance_sampling_ratio/mean": 0.9998691082000732, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.4335259199142456, "sampling/sampling_logp_difference/mean": 0.014265717938542366, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 205.359375, "completions/mean_terminated_length": 205.359375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4888272285461426, "epoch": 0.2769607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.9265821285741606, "kl": 0.013852481730282307, "learning_rate": 9.146341463414634e-07, "loss": 0.0056, "num_tokens": 7076422.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3700966835021973, "sampling/importance_sampling_ratio/mean": 0.9996808767318726, "sampling/importance_sampling_ratio/min": 0.7715784907341003, "sampling/sampling_logp_difference/max": 0.3148813247680664, "sampling/sampling_logp_difference/mean": 0.01662006415426731, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3419240117073059, "epoch": 0.27818627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.03017489332894213, "kl": 0.014357686042785645, "learning_rate": 9.186991869918699e-07, "loss": 0.0001, "num_tokens": 7105426.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5626256465911865, "sampling/importance_sampling_ratio/mean": 0.999943733215332, "sampling/importance_sampling_ratio/min": 0.6393700242042542, "sampling/sampling_logp_difference/max": 0.4472719430923462, "sampling/sampling_logp_difference/mean": 0.014091861434280872, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 238.265625, "completions/mean_terminated_length": 238.265625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.38244467973709106, "epoch": 0.27941176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.021481967245729417, "kl": 0.009679099544882774, "learning_rate": 9.227642276422763e-07, "loss": 0.0001, "num_tokens": 7142787.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4060695171356201, "sampling/importance_sampling_ratio/mean": 1.0001033544540405, "sampling/importance_sampling_ratio/min": 0.7145366072654724, "sampling/sampling_logp_difference/max": 0.3407982587814331, "sampling/sampling_logp_difference/mean": 0.013826992362737656, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 206.09375, "completions/mean_terminated_length": 206.09375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.41250723600387573, "epoch": 0.2806372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.6999583230241203, "kl": 0.011442895978689194, "learning_rate": 9.26829268292683e-07, "loss": 0.003, "num_tokens": 7175609.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4759176969528198, "sampling/importance_sampling_ratio/mean": 0.9998277425765991, "sampling/importance_sampling_ratio/min": 0.6153390407562256, "sampling/sampling_logp_difference/max": 0.4855818748474121, "sampling/sampling_logp_difference/mean": 0.01563001424074173, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 215.171875, "completions/mean_terminated_length": 215.171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.47626060247421265, "epoch": 0.2818627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.02398809377788757, "kl": 0.0111317690461874, "learning_rate": 9.308943089430894e-07, "loss": 0.0001, "num_tokens": 7219956.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6365751028060913, "sampling/importance_sampling_ratio/mean": 1.0000046491622925, "sampling/importance_sampling_ratio/min": 0.6299163699150085, "sampling/sampling_logp_difference/max": 0.49260568618774414, "sampling/sampling_logp_difference/mean": 0.017304297536611557, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 221.953125, "completions/mean_terminated_length": 221.953125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3158389925956726, "epoch": 0.28308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.016257277204131226, "kl": 0.006831689737737179, "learning_rate": 9.349593495934958e-07, "loss": 0.0001, "num_tokens": 7251873.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997228384017944, "sampling/importance_sampling_ratio/min": 0.6526646614074707, "sampling/sampling_logp_difference/max": 0.7052080631256104, "sampling/sampling_logp_difference/mean": 0.011903062462806702, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 178.953125, "completions/mean_terminated_length": 178.953125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4166858196258545, "epoch": 0.28431372549019607, "frac_reward_zero_std": 1.0, "grad_norm": 0.024739217097453914, "kl": 0.010809991508722305, "learning_rate": 9.390243902439024e-07, "loss": 0.0001, "num_tokens": 7283470.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.39656400680542, "sampling/importance_sampling_ratio/mean": 1.0000054836273193, "sampling/importance_sampling_ratio/min": 0.4615365266799927, "sampling/sampling_logp_difference/max": 0.7731940746307373, "sampling/sampling_logp_difference/mean": 0.01575036346912384, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 207.078125, "completions/mean_terminated_length": 207.078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.42489081621170044, "epoch": 0.2855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.8525932508928461, "kl": 0.011268509551882744, "learning_rate": 9.430894308943089e-07, "loss": -0.0118, "num_tokens": 7311683.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3816370964050293, "sampling/importance_sampling_ratio/mean": 1.0003949403762817, "sampling/importance_sampling_ratio/min": 0.6659024357795715, "sampling/sampling_logp_difference/max": 0.4066121578216553, "sampling/sampling_logp_difference/mean": 0.016137288883328438, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 281.765625, "completions/mean_terminated_length": 281.765625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.44613972306251526, "epoch": 0.2867647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 0.8742479747553392, "kl": 0.0076088979840278625, "learning_rate": 9.471544715447154e-07, "loss": -0.0017, "num_tokens": 7352052.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9099587202072144, "sampling/importance_sampling_ratio/mean": 1.000007152557373, "sampling/importance_sampling_ratio/min": 0.4805276095867157, "sampling/sampling_logp_difference/max": 0.7328705787658691, "sampling/sampling_logp_difference/mean": 0.014782894402742386, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3496655821800232, "epoch": 0.28799019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.02338081316659742, "kl": 0.00958043709397316, "learning_rate": 9.512195121951218e-07, "loss": 0.0001, "num_tokens": 7383184.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.494954228401184, "sampling/importance_sampling_ratio/mean": 1.0002262592315674, "sampling/importance_sampling_ratio/min": 0.695496678352356, "sampling/sampling_logp_difference/max": 0.4020955562591553, "sampling/sampling_logp_difference/mean": 0.014258678071200848, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 159.953125, "completions/mean_terminated_length": 159.953125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.45076289772987366, "epoch": 0.28921568627450983, "frac_reward_zero_std": 0.5, "grad_norm": 1.6534245681919217, "kl": 0.01330963708460331, "learning_rate": 9.552845528455285e-07, "loss": 0.047, "num_tokens": 7407949.0, "reward": 0.1875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6504887342453003, "sampling/importance_sampling_ratio/mean": 1.0006005764007568, "sampling/importance_sampling_ratio/min": 0.6347039937973022, "sampling/sampling_logp_difference/max": 0.5010714530944824, "sampling/sampling_logp_difference/mean": 0.019480448216199875, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 202.34375, "completions/mean_terminated_length": 202.34375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4706074893474579, "epoch": 0.29044117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.02233542927478274, "kl": 0.009093056432902813, "learning_rate": 9.59349593495935e-07, "loss": 0.0001, "num_tokens": 7455491.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3603936433792114, "sampling/importance_sampling_ratio/mean": 1.000697135925293, "sampling/importance_sampling_ratio/min": 0.6341907978057861, "sampling/sampling_logp_difference/max": 0.45540547370910645, "sampling/sampling_logp_difference/mean": 0.017054909840226173, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 184.8125, "completions/mean_terminated_length": 184.8125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.40207499265670776, "epoch": 0.2916666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 1.0504165522851772, "kl": 0.009620942175388336, "learning_rate": 9.634146341463414e-07, "loss": 0.013, "num_tokens": 7481543.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5662195682525635, "sampling/importance_sampling_ratio/mean": 1.0004643201828003, "sampling/importance_sampling_ratio/min": 0.6290379762649536, "sampling/sampling_logp_difference/max": 0.4635636806488037, "sampling/sampling_logp_difference/mean": 0.01674029976129532, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 194.296875, "completions/mean_terminated_length": 194.296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.37485432624816895, "epoch": 0.2928921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.021978196293519697, "kl": 0.008883722126483917, "learning_rate": 9.67479674796748e-07, "loss": 0.0001, "num_tokens": 7514042.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.576985239982605, "sampling/importance_sampling_ratio/mean": 1.000411033630371, "sampling/importance_sampling_ratio/min": 0.657006025314331, "sampling/sampling_logp_difference/max": 0.45551490783691406, "sampling/sampling_logp_difference/mean": 0.016208358108997345, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 217.609375, "completions/mean_terminated_length": 217.609375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3677450716495514, "epoch": 0.29411764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.01726305153614504, "kl": 0.006754512898623943, "learning_rate": 9.715447154471544e-07, "loss": 0.0001, "num_tokens": 7545217.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.530495047569275, "sampling/importance_sampling_ratio/mean": 0.9997333884239197, "sampling/importance_sampling_ratio/min": 0.541140615940094, "sampling/sampling_logp_difference/max": 0.6140761375427246, "sampling/sampling_logp_difference/mean": 0.014982339926064014, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 187.515625, "completions/mean_terminated_length": 187.515625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3908500075340271, "epoch": 0.29534313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.018932848100693202, "kl": 0.00762729998677969, "learning_rate": 9.756097560975609e-07, "loss": 0.0001, "num_tokens": 7573250.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.508407711982727, "sampling/importance_sampling_ratio/mean": 0.9994632005691528, "sampling/importance_sampling_ratio/min": 0.630445122718811, "sampling/sampling_logp_difference/max": 0.46132922172546387, "sampling/sampling_logp_difference/mean": 0.01583741419017315, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 216.390625, "completions/mean_terminated_length": 216.390625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.42570817470550537, "epoch": 0.2965686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.9179455485735144, "kl": 0.007978806272149086, "learning_rate": 9.796747967479673e-07, "loss": 0.0111, "num_tokens": 7612923.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998685121536255, "sampling/importance_sampling_ratio/min": 0.6407722234725952, "sampling/sampling_logp_difference/max": 0.8378071784973145, "sampling/sampling_logp_difference/mean": 0.01628207042813301, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 173.921875, "completions/mean_terminated_length": 173.921875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.40627628564834595, "epoch": 0.2977941176470588, "frac_reward_zero_std": 0.5, "grad_norm": 1.6958476637616493, "kl": 0.008724816143512726, "learning_rate": 9.83739837398374e-07, "loss": 0.0474, "num_tokens": 7639942.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6157478094100952, "sampling/importance_sampling_ratio/mean": 0.999195396900177, "sampling/importance_sampling_ratio/min": 0.6374641060829163, "sampling/sampling_logp_difference/max": 0.4797978401184082, "sampling/sampling_logp_difference/mean": 0.01582256704568863, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 168.8125, "completions/mean_terminated_length": 168.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.40013957023620605, "epoch": 0.29901960784313725, "frac_reward_zero_std": 0.75, "grad_norm": 1.2749960429030391, "kl": 0.009192371740937233, "learning_rate": 9.878048780487804e-07, "loss": 0.0099, "num_tokens": 7667978.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4499542713165283, "sampling/importance_sampling_ratio/mean": 1.000160813331604, "sampling/importance_sampling_ratio/min": 0.5401555299758911, "sampling/sampling_logp_difference/max": 0.6158981323242188, "sampling/sampling_logp_difference/mean": 0.01715931110084057, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 194.984375, "completions/mean_terminated_length": 194.984375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.44018101692199707, "epoch": 0.3002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.021884061756286145, "kl": 0.008384596556425095, "learning_rate": 9.918699186991869e-07, "loss": 0.0001, "num_tokens": 7706681.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6211398839950562, "sampling/importance_sampling_ratio/mean": 0.9999797344207764, "sampling/importance_sampling_ratio/min": 0.6720573902130127, "sampling/sampling_logp_difference/max": 0.48312950134277344, "sampling/sampling_logp_difference/mean": 0.016173996031284332, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 221.828125, "completions/mean_terminated_length": 221.828125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.44105350971221924, "epoch": 0.3014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.016795578346263017, "kl": 0.007627334911376238, "learning_rate": 9.959349593495935e-07, "loss": 0.0001, "num_tokens": 7742446.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 0.9993383288383484, "sampling/importance_sampling_ratio/min": 0.49609559774398804, "sampling/sampling_logp_difference/max": 0.7009866237640381, "sampling/sampling_logp_difference/mean": 0.017043959349393845, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 183.90625, "completions/mean_terminated_length": 183.90625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2399781346321106, "epoch": 0.30269607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.019973136134103113, "kl": 0.006935018114745617, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 7769512.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5591720342636108, "sampling/importance_sampling_ratio/mean": 0.9999405145645142, "sampling/importance_sampling_ratio/min": 0.614763617515564, "sampling/sampling_logp_difference/max": 0.48651742935180664, "sampling/sampling_logp_difference/mean": 0.011417325586080551, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 149.71875, "completions/mean_terminated_length": 149.71875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3416861891746521, "epoch": 0.30392156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.026188917228080588, "kl": 0.010590678080916405, "learning_rate": 9.99999492515838e-07, "loss": 0.0001, "num_tokens": 7796614.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7297691106796265, "sampling/importance_sampling_ratio/mean": 1.0001978874206543, "sampling/importance_sampling_ratio/min": 0.617793619632721, "sampling/sampling_logp_difference/max": 0.5479879379272461, "sampling/sampling_logp_difference/mean": 0.016203757375478745, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 191.640625, "completions/mean_terminated_length": 191.640625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.449499249458313, "epoch": 0.30514705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.9278129375037183, "kl": 0.013040348887443542, "learning_rate": 9.99997970064382e-07, "loss": -0.0467, "num_tokens": 7829743.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6324667930603027, "sampling/importance_sampling_ratio/mean": 0.9997720718383789, "sampling/importance_sampling_ratio/min": 0.2991386353969574, "sampling/sampling_logp_difference/max": 1.20684814453125, "sampling/sampling_logp_difference/mean": 0.017926493659615517, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 163.578125, "completions/mean_terminated_length": 163.578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.41771259903907776, "epoch": 0.30637254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.06760607057143501, "kl": 0.014883172698318958, "learning_rate": 9.999954326487227e-07, "loss": 0.0001, "num_tokens": 7855284.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.506048321723938, "sampling/importance_sampling_ratio/mean": 0.9999994039535522, "sampling/importance_sampling_ratio/min": 0.6286977529525757, "sampling/sampling_logp_difference/max": 0.46410465240478516, "sampling/sampling_logp_difference/mean": 0.018769796937704086, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3097831606864929, "epoch": 0.3075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0385912065717164, "kl": 0.011668984778225422, "learning_rate": 9.999918802740106e-07, "loss": 0.0086, "num_tokens": 7877812.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.523362636566162, "sampling/importance_sampling_ratio/mean": 1.0003794431686401, "sampling/importance_sampling_ratio/min": 0.6056217551231384, "sampling/sampling_logp_difference/max": 0.5014996528625488, "sampling/sampling_logp_difference/mean": 0.015177415683865547, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3114874064922333, "epoch": 0.3088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03592110566504937, "kl": 0.013739445246756077, "learning_rate": 9.999873129474573e-07, "loss": 0.0001, "num_tokens": 7906372.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4397625923156738, "sampling/importance_sampling_ratio/mean": 0.9998765587806702, "sampling/importance_sampling_ratio/min": 0.6155447959899902, "sampling/sampling_logp_difference/max": 0.4852476119995117, "sampling/sampling_logp_difference/mean": 0.014856807887554169, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 179.921875, "completions/mean_terminated_length": 179.921875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4216104745864868, "epoch": 0.31004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.9192886737372505, "kl": 0.015545014292001724, "learning_rate": 9.999817306783336e-07, "loss": -0.009, "num_tokens": 7932975.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.463365912437439, "sampling/importance_sampling_ratio/mean": 1.0002342462539673, "sampling/importance_sampling_ratio/min": 0.6248787641525269, "sampling/sampling_logp_difference/max": 0.4701976776123047, "sampling/sampling_logp_difference/mean": 0.017898280173540115, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 147.03125, "completions/mean_terminated_length": 147.03125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3145422339439392, "epoch": 0.3112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03524478669237915, "kl": 0.013532701879739761, "learning_rate": 9.999751334779714e-07, "loss": 0.0001, "num_tokens": 7956961.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.580665111541748, "sampling/importance_sampling_ratio/mean": 1.0000933408737183, "sampling/importance_sampling_ratio/min": 0.6483817100524902, "sampling/sampling_logp_difference/max": 0.45784568786621094, "sampling/sampling_logp_difference/mean": 0.015270882286131382, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 175.578125, "completions/mean_terminated_length": 175.578125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.375484824180603, "epoch": 0.3125, "frac_reward_zero_std": 1.0, "grad_norm": 0.031741820057699044, "kl": 0.011409718543291092, "learning_rate": 9.999675213597626e-07, "loss": 0.0001, "num_tokens": 7987542.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.524396300315857, "sampling/importance_sampling_ratio/mean": 1.0004853010177612, "sampling/importance_sampling_ratio/min": 0.6115931272506714, "sampling/sampling_logp_difference/max": 0.4916880130767822, "sampling/sampling_logp_difference/mean": 0.016877297312021255, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 189.0625, "completions/mean_terminated_length": 189.0625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3867337703704834, "epoch": 0.3137254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.05021085968000162, "kl": 0.012918060645461082, "learning_rate": 9.999588943391595e-07, "loss": 0.0001, "num_tokens": 8017594.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.417964220046997, "sampling/importance_sampling_ratio/mean": 0.9997158050537109, "sampling/importance_sampling_ratio/min": 0.6509522795677185, "sampling/sampling_logp_difference/max": 0.429318904876709, "sampling/sampling_logp_difference/mean": 0.017811531201004982, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 145.34375, "completions/mean_terminated_length": 145.34375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3366623818874359, "epoch": 0.31495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.8153839531646045, "kl": 0.013880794867873192, "learning_rate": 9.999492524336742e-07, "loss": -0.0117, "num_tokens": 8041200.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6287307739257812, "sampling/importance_sampling_ratio/mean": 0.9995911717414856, "sampling/importance_sampling_ratio/min": 0.6185733675956726, "sampling/sampling_logp_difference/max": 0.48780107498168945, "sampling/sampling_logp_difference/mean": 0.016190677881240845, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 166.921875, "completions/mean_terminated_length": 166.921875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3677437901496887, "epoch": 0.3161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.04122863649489965, "kl": 0.015979524701833725, "learning_rate": 9.999385956628792e-07, "loss": 0.0002, "num_tokens": 8066907.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4311892986297607, "sampling/importance_sampling_ratio/mean": 1.000390887260437, "sampling/importance_sampling_ratio/min": 0.6117782592773438, "sampling/sampling_logp_difference/max": 0.49138545989990234, "sampling/sampling_logp_difference/mean": 0.018296608701348305, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 196.4375, "completions/mean_terminated_length": 196.4375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3097396492958069, "epoch": 0.3174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.069374312163895, "kl": 0.011495886370539665, "learning_rate": 9.999269240484069e-07, "loss": -0.0209, "num_tokens": 8097335.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.633513331413269, "sampling/importance_sampling_ratio/mean": 1.0004537105560303, "sampling/importance_sampling_ratio/min": 0.6408920288085938, "sampling/sampling_logp_difference/max": 0.49073314666748047, "sampling/sampling_logp_difference/mean": 0.014788438566029072, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.35195496678352356, "epoch": 0.31862745098039214, "frac_reward_zero_std": 0.75, "grad_norm": 1.024392022286009, "kl": 0.013708166778087616, "learning_rate": 9.999142376139503e-07, "loss": -0.015, "num_tokens": 8129875.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5133965015411377, "sampling/importance_sampling_ratio/mean": 0.9998887777328491, "sampling/importance_sampling_ratio/min": 0.6070231199264526, "sampling/sampling_logp_difference/max": 0.4991884231567383, "sampling/sampling_logp_difference/mean": 0.016092892736196518, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 157.921875, "completions/mean_terminated_length": 157.921875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.309036523103714, "epoch": 0.31985294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.05828600558040216, "kl": 0.016801748424768448, "learning_rate": 9.999005363852617e-07, "loss": 0.0002, "num_tokens": 8155838.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6100053787231445, "sampling/importance_sampling_ratio/mean": 0.999739944934845, "sampling/importance_sampling_ratio/min": 0.6941452622413635, "sampling/sampling_logp_difference/max": 0.47623753547668457, "sampling/sampling_logp_difference/mean": 0.014917733147740364, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 237.421875, "completions/mean_terminated_length": 237.421875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.29509255290031433, "epoch": 0.32107843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.02039980450526182, "kl": 0.008284160867333412, "learning_rate": 9.99885820390154e-07, "loss": 0.0001, "num_tokens": 8190153.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.425432801246643, "sampling/importance_sampling_ratio/mean": 1.000622034072876, "sampling/importance_sampling_ratio/min": 0.6363792419433594, "sampling/sampling_logp_difference/max": 0.45196056365966797, "sampling/sampling_logp_difference/mean": 0.01347966305911541, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 194.359375, "completions/mean_terminated_length": 194.359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.42587071657180786, "epoch": 0.32230392156862747, "frac_reward_zero_std": 0.75, "grad_norm": 1.0083109275275735, "kl": 0.010584939271211624, "learning_rate": 9.998700896584995e-07, "loss": -0.0051, "num_tokens": 8222880.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4777371883392334, "sampling/importance_sampling_ratio/mean": 0.9997612833976746, "sampling/importance_sampling_ratio/min": 0.606777012348175, "sampling/sampling_logp_difference/max": 0.49959397315979004, "sampling/sampling_logp_difference/mean": 0.016714511439204216, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3739390969276428, "epoch": 0.3235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01750328991956531, "kl": 0.007153388112783432, "learning_rate": 9.998533442222308e-07, "loss": 0.0001, "num_tokens": 8258328.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6946955919265747, "sampling/importance_sampling_ratio/mean": 0.9997098445892334, "sampling/importance_sampling_ratio/min": 0.6196833848953247, "sampling/sampling_logp_difference/max": 0.5275031328201294, "sampling/sampling_logp_difference/mean": 0.015847668051719666, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 212.203125, "completions/mean_terminated_length": 212.203125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3422619104385376, "epoch": 0.3247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.1888208198494439, "kl": 0.012853987514972687, "learning_rate": 9.9983558411534e-07, "loss": -0.0012, "num_tokens": 8287157.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.363533854484558, "sampling/importance_sampling_ratio/mean": 1.0000767707824707, "sampling/importance_sampling_ratio/min": 0.6381403803825378, "sampling/sampling_logp_difference/max": 0.44919705390930176, "sampling/sampling_logp_difference/mean": 0.015186481177806854, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 236.4375, "completions/mean_terminated_length": 236.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.36014288663864136, "epoch": 0.32598039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.021077412144935612, "kl": 0.009041134268045425, "learning_rate": 9.99816809373879e-07, "loss": 0.0001, "num_tokens": 8322577.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4862349033355713, "sampling/importance_sampling_ratio/mean": 0.9995858669281006, "sampling/importance_sampling_ratio/min": 0.6177365183830261, "sampling/sampling_logp_difference/max": 0.4816932678222656, "sampling/sampling_logp_difference/mean": 0.01599959097802639, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 237.65625, "completions/mean_terminated_length": 237.65625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4498145282268524, "epoch": 0.3272058823529412, "frac_reward_zero_std": 0.5, "grad_norm": 1.1473118671642435, "kl": 0.011207411997020245, "learning_rate": 9.99797020035959e-07, "loss": -0.0174, "num_tokens": 8357211.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.653535008430481, "sampling/importance_sampling_ratio/mean": 1.0001318454742432, "sampling/importance_sampling_ratio/min": 0.6112035512924194, "sampling/sampling_logp_difference/max": 0.5029153823852539, "sampling/sampling_logp_difference/mean": 0.01675787940621376, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 213.140625, "completions/mean_terminated_length": 213.140625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.30425506830215454, "epoch": 0.3284313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0320770130593047, "kl": 0.01154828630387783, "learning_rate": 9.997762161417517e-07, "loss": 0.0001, "num_tokens": 8388548.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.530159831047058, "sampling/importance_sampling_ratio/mean": 1.0002944469451904, "sampling/importance_sampling_ratio/min": 0.6133034825325012, "sampling/sampling_logp_difference/max": 0.4888954162597656, "sampling/sampling_logp_difference/mean": 0.015344790183007717, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 241.640625, "completions/mean_terminated_length": 241.640625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3790775537490845, "epoch": 0.32965686274509803, "frac_reward_zero_std": 0.5, "grad_norm": 2.93327834126621, "kl": 0.009812450036406517, "learning_rate": 9.997543977334873e-07, "loss": -0.0168, "num_tokens": 8429437.0, "reward": -0.25, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.9876813888549805, "sampling/importance_sampling_ratio/mean": 1.0003782510757446, "sampling/importance_sampling_ratio/min": 0.7020878195762634, "sampling/sampling_logp_difference/max": 0.6869688034057617, "sampling/sampling_logp_difference/mean": 0.014591362327337265, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 206.484375, "completions/mean_terminated_length": 206.484375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.38758742809295654, "epoch": 0.33088235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.026128193846614904, "kl": 0.010831797495484352, "learning_rate": 9.99731564855456e-07, "loss": 0.0001, "num_tokens": 8459324.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5065176486968994, "sampling/importance_sampling_ratio/mean": 0.9998651146888733, "sampling/importance_sampling_ratio/min": 0.6188350319862366, "sampling/sampling_logp_difference/max": 0.4799165725708008, "sampling/sampling_logp_difference/mean": 0.01775408536195755, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 278.890625, "completions/mean_terminated_length": 278.890625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2929910123348236, "epoch": 0.3321078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.019622187422360302, "kl": 0.011361843906342983, "learning_rate": 9.997077175540066e-07, "loss": 0.0001, "num_tokens": 8496661.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.534928798675537, "sampling/importance_sampling_ratio/mean": 1.00014066696167, "sampling/importance_sampling_ratio/min": 0.7192728519439697, "sampling/sampling_logp_difference/max": 0.4284839630126953, "sampling/sampling_logp_difference/mean": 0.012748262844979763, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 210.03125, "completions/mean_terminated_length": 210.03125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3396623134613037, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.8073554666654875, "kl": 0.013835672289133072, "learning_rate": 9.996828558775485e-07, "loss": -0.014, "num_tokens": 8531031.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9684661626815796, "sampling/importance_sampling_ratio/mean": 0.9999529123306274, "sampling/importance_sampling_ratio/min": 0.5509454011917114, "sampling/sampling_logp_difference/max": 0.6772546768188477, "sampling/sampling_logp_difference/mean": 0.014482136815786362, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.35685327649116516, "epoch": 0.33455882352941174, "frac_reward_zero_std": 0.5, "grad_norm": 1.1293923135993438, "kl": 0.01976989209651947, "learning_rate": 9.996569798765487e-07, "loss": -0.0089, "num_tokens": 8560023.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5511717796325684, "sampling/importance_sampling_ratio/mean": 1.0003318786621094, "sampling/importance_sampling_ratio/min": 0.6207550168037415, "sampling/sampling_logp_difference/max": 0.4768187999725342, "sampling/sampling_logp_difference/mean": 0.014719195663928986, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 202.34375, "completions/mean_terminated_length": 202.34375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3357786536216736, "epoch": 0.33578431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.04071909819670399, "kl": 0.01715000718832016, "learning_rate": 9.996300896035338e-07, "loss": 0.0002, "num_tokens": 8587821.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4186033010482788, "sampling/importance_sampling_ratio/mean": 0.9999063014984131, "sampling/importance_sampling_ratio/min": 0.618531346321106, "sampling/sampling_logp_difference/max": 0.4804074764251709, "sampling/sampling_logp_difference/mean": 0.015081456862390041, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 264.359375, "completions/mean_terminated_length": 264.359375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.36299800872802734, "epoch": 0.33700980392156865, "frac_reward_zero_std": 0.75, "grad_norm": 0.7979230304488981, "kl": 0.014299536123871803, "learning_rate": 9.996021851130896e-07, "loss": 0.0178, "num_tokens": 8622788.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6541705131530762, "sampling/importance_sampling_ratio/mean": 1.0003371238708496, "sampling/importance_sampling_ratio/min": 0.6095863580703735, "sampling/sampling_logp_difference/max": 0.5032997131347656, "sampling/sampling_logp_difference/mean": 0.014425665140151978, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 259.359375, "completions/mean_terminated_length": 259.359375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.37495720386505127, "epoch": 0.3382352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 1.0490685489830045, "kl": 0.014999349601566792, "learning_rate": 9.995732664618603e-07, "loss": 0.1337, "num_tokens": 8671851.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6221331357955933, "sampling/importance_sampling_ratio/mean": 1.000325322151184, "sampling/importance_sampling_ratio/min": 0.38583245873451233, "sampling/sampling_logp_difference/max": 0.9523520469665527, "sampling/sampling_logp_difference/mean": 0.015569565817713737, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 270.921875, "completions/mean_terminated_length": 270.921875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.39126330614089966, "epoch": 0.3394607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.6069572212361701, "kl": 0.019539829343557358, "learning_rate": 9.99543333708549e-07, "loss": -0.0189, "num_tokens": 8706006.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4988577365875244, "sampling/importance_sampling_ratio/mean": 0.9995763897895813, "sampling/importance_sampling_ratio/min": 0.6227314472198486, "sampling/sampling_logp_difference/max": 0.47363996505737305, "sampling/sampling_logp_difference/mean": 0.015419903211295605, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 194.4375, "completions/mean_terminated_length": 194.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.37649017572402954, "epoch": 0.34068627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.053443642162347454, "kl": 0.02782365307211876, "learning_rate": 9.995123869139176e-07, "loss": 0.0003, "num_tokens": 8731874.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4893184900283813, "sampling/importance_sampling_ratio/mean": 1.0006440877914429, "sampling/importance_sampling_ratio/min": 0.6645188331604004, "sampling/sampling_logp_difference/max": 0.4086921215057373, "sampling/sampling_logp_difference/mean": 0.01722247712314129, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5158754587173462, "epoch": 0.34191176470588236, "frac_reward_zero_std": 0.5, "grad_norm": 0.9434177905899298, "kl": 0.015656255185604095, "learning_rate": 9.994804261407854e-07, "loss": 0.0044, "num_tokens": 8775930.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6037347316741943, "sampling/importance_sampling_ratio/mean": 1.0002131462097168, "sampling/importance_sampling_ratio/min": 0.6202828288078308, "sampling/sampling_logp_difference/max": 0.47757983207702637, "sampling/sampling_logp_difference/mean": 0.017696373164653778, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4062265157699585, "epoch": 0.3431372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.8061154001343366, "kl": 0.025968432426452637, "learning_rate": 9.994474514540312e-07, "loss": -0.0021, "num_tokens": 8810638.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6301919221878052, "sampling/importance_sampling_ratio/mean": 0.9998388886451721, "sampling/importance_sampling_ratio/min": 0.7160124182701111, "sampling/sampling_logp_difference/max": 0.48869776725769043, "sampling/sampling_logp_difference/mean": 0.015875108540058136, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 251.265625, "completions/mean_terminated_length": 251.265625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3788436949253082, "epoch": 0.3443627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0342866177651477, "kl": 0.020827017724514008, "learning_rate": 9.994134629205917e-07, "loss": 0.0002, "num_tokens": 8843807.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006530284881592, "sampling/importance_sampling_ratio/min": 0.607172966003418, "sampling/sampling_logp_difference/max": 0.7796788215637207, "sampling/sampling_logp_difference/mean": 0.015283560380339622, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 238.96875, "completions/mean_terminated_length": 238.96875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3100021779537201, "epoch": 0.34558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0346732415875164, "kl": 0.021007856354117393, "learning_rate": 9.99378460609461e-07, "loss": 0.0002, "num_tokens": 8873645.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3672375679016113, "sampling/importance_sampling_ratio/mean": 1.0003538131713867, "sampling/importance_sampling_ratio/min": 0.6927396655082703, "sampling/sampling_logp_difference/max": 0.36710095405578613, "sampling/sampling_logp_difference/mean": 0.013294361531734467, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 267.90625, "completions/mean_terminated_length": 267.90625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5047875642776489, "epoch": 0.34681372549019607, "frac_reward_zero_std": 0.5, "grad_norm": 1.0632033845726123, "kl": 0.021810028702020645, "learning_rate": 9.993424445916922e-07, "loss": -0.0005, "num_tokens": 8908199.0, "reward": -0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.495995283126831, "sampling/importance_sampling_ratio/mean": 0.9999805688858032, "sampling/importance_sampling_ratio/min": 0.5697722434997559, "sampling/sampling_logp_difference/max": 0.5625185966491699, "sampling/sampling_logp_difference/mean": 0.017674528062343597, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 206.4375, "completions/mean_terminated_length": 206.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.38155049085617065, "epoch": 0.3480392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.04724550668822444, "kl": 0.023773398250341415, "learning_rate": 9.993054149403949e-07, "loss": 0.0002, "num_tokens": 8938131.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.451388955116272, "sampling/importance_sampling_ratio/mean": 0.9998237490653992, "sampling/importance_sampling_ratio/min": 0.6486294269561768, "sampling/sampling_logp_difference/max": 0.4328937530517578, "sampling/sampling_logp_difference/mean": 0.01579119637608528, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 225.71875, "completions/mean_terminated_length": 225.71875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.41737085580825806, "epoch": 0.3492647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.7651160195940809, "kl": 0.020508062094449997, "learning_rate": 9.992673717307372e-07, "loss": 0.0186, "num_tokens": 8969329.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4221943616867065, "sampling/importance_sampling_ratio/mean": 1.000159502029419, "sampling/importance_sampling_ratio/min": 0.6715134382247925, "sampling/sampling_logp_difference/max": 0.3982212543487549, "sampling/sampling_logp_difference/mean": 0.01604665070772171, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 335.9375, "completions/mean_terminated_length": 335.9375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4537578523159027, "epoch": 0.35049019607843135, "frac_reward_zero_std": 0.5, "grad_norm": 0.9120153238112306, "kl": 0.016356246545910835, "learning_rate": 9.992283150399446e-07, "loss": 0.0165, "num_tokens": 9010605.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4032405614852905, "sampling/importance_sampling_ratio/mean": 1.000108003616333, "sampling/importance_sampling_ratio/min": 0.623317301273346, "sampling/sampling_logp_difference/max": 0.4726996421813965, "sampling/sampling_logp_difference/mean": 0.015053913928568363, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 206.34375, "completions/mean_terminated_length": 206.34375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3335433602333069, "epoch": 0.35171568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.04277451366755491, "kl": 0.027477117255330086, "learning_rate": 9.991882449472994e-07, "loss": 0.0003, "num_tokens": 9037699.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4695838689804077, "sampling/importance_sampling_ratio/mean": 1.0002992153167725, "sampling/importance_sampling_ratio/min": 0.6815792918205261, "sampling/sampling_logp_difference/max": 0.384979248046875, "sampling/sampling_logp_difference/mean": 0.014059789478778839, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 267.671875, "completions/mean_terminated_length": 267.671875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3319460451602936, "epoch": 0.35294117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.03102749230494356, "kl": 0.021260742098093033, "learning_rate": 9.991471615341415e-07, "loss": 0.0002, "num_tokens": 9073310.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4058274030685425, "sampling/importance_sampling_ratio/mean": 1.0000005960464478, "sampling/importance_sampling_ratio/min": 0.722834050655365, "sampling/sampling_logp_difference/max": 0.3406260013580322, "sampling/sampling_logp_difference/mean": 0.012393254786729813, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4477394223213196, "epoch": 0.3541666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.702509219757032, "kl": 0.020046524703502655, "learning_rate": 9.991050648838675e-07, "loss": 0.0115, "num_tokens": 9116958.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6007344722747803, "sampling/importance_sampling_ratio/mean": 1.0000159740447998, "sampling/importance_sampling_ratio/min": 0.6227967143058777, "sampling/sampling_logp_difference/max": 0.47353506088256836, "sampling/sampling_logp_difference/mean": 0.015253344550728798, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 261.078125, "completions/mean_terminated_length": 261.078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.5179446935653687, "epoch": 0.3553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.245212156846121, "kl": 0.02527690678834915, "learning_rate": 9.990619550819312e-07, "loss": -0.0282, "num_tokens": 9154323.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.2850115299224854, "sampling/importance_sampling_ratio/mean": 1.0000402927398682, "sampling/importance_sampling_ratio/min": 0.6368844509124756, "sampling/sampling_logp_difference/max": 0.45116710662841797, "sampling/sampling_logp_difference/mean": 0.016047444194555283, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 226.859375, "completions/mean_terminated_length": 226.859375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.42619866132736206, "epoch": 0.35661764705882354, "frac_reward_zero_std": 0.5, "grad_norm": 0.9808054354022362, "kl": 0.035265617072582245, "learning_rate": 9.990178322158424e-07, "loss": 0.011, "num_tokens": 9184938.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.435062289237976, "sampling/importance_sampling_ratio/mean": 1.000767707824707, "sampling/importance_sampling_ratio/min": 0.6609618663787842, "sampling/sampling_logp_difference/max": 0.4140591621398926, "sampling/sampling_logp_difference/mean": 0.015793394297361374, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 265.328125, "completions/mean_terminated_length": 265.328125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4519370198249817, "epoch": 0.35784313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.8398408474879482, "kl": 0.024876803159713745, "learning_rate": 9.989726963751682e-07, "loss": -0.0675, "num_tokens": 9223935.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0003069639205933, "sampling/importance_sampling_ratio/min": 0.681201696395874, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.014846328645944595, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 280.984375, "completions/mean_terminated_length": 280.984375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3856172263622284, "epoch": 0.3590686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.9592771356076676, "kl": 0.02183361165225506, "learning_rate": 9.989265476515309e-07, "loss": 0.0004, "num_tokens": 9261886.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6392227411270142, "sampling/importance_sampling_ratio/mean": 0.9999226927757263, "sampling/importance_sampling_ratio/min": 0.5483723878860474, "sampling/sampling_logp_difference/max": 0.600800633430481, "sampling/sampling_logp_difference/mean": 0.012838400900363922, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 268.109375, "completions/mean_terminated_length": 268.109375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3931152820587158, "epoch": 0.3602941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.041518879366459226, "kl": 0.026410872116684914, "learning_rate": 9.9887938613861e-07, "loss": 0.0002, "num_tokens": 9300501.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4378784894943237, "sampling/importance_sampling_ratio/mean": 1.0005898475646973, "sampling/importance_sampling_ratio/min": 0.5106711983680725, "sampling/sampling_logp_difference/max": 0.6720293760299683, "sampling/sampling_logp_difference/mean": 0.01435445062816143, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 246.84375, "completions/mean_terminated_length": 246.84375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.5600332021713257, "epoch": 0.36151960784313725, "frac_reward_zero_std": 0.5, "grad_norm": 1.0508689845863735, "kl": 0.02834884449839592, "learning_rate": 9.988312119321402e-07, "loss": 0.0104, "num_tokens": 9331275.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4296252727508545, "sampling/importance_sampling_ratio/mean": 1.000156283378601, "sampling/importance_sampling_ratio/min": 0.6961706280708313, "sampling/sampling_logp_difference/max": 0.36216044425964355, "sampling/sampling_logp_difference/mean": 0.01765412464737892, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 236.4375, "completions/mean_terminated_length": 236.4375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.5023055076599121, "epoch": 0.3627450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.0406992424663923, "kl": 0.02572363056242466, "learning_rate": 9.98782025129912e-07, "loss": 0.0237, "num_tokens": 9363319.0, "reward": -0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4045171737670898, "sampling/importance_sampling_ratio/mean": 1.0002970695495605, "sampling/importance_sampling_ratio/min": 0.6565462946891785, "sampling/sampling_logp_difference/max": 0.4207620620727539, "sampling/sampling_logp_difference/mean": 0.016197221353650093, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 244.09375, "completions/mean_terminated_length": 244.09375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3690158724784851, "epoch": 0.3639705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.1297929680290169, "kl": 0.02254669740796089, "learning_rate": 9.987318258317715e-07, "loss": -0.0009, "num_tokens": 9394173.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5062799453735352, "sampling/importance_sampling_ratio/mean": 0.9997458457946777, "sampling/importance_sampling_ratio/min": 0.48347559571266174, "sampling/sampling_logp_difference/max": 0.7267544269561768, "sampling/sampling_logp_difference/mean": 0.013869008049368858, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 214.109375, "completions/mean_terminated_length": 214.109375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4830111861228943, "epoch": 0.36519607843137253, "frac_reward_zero_std": 0.5, "grad_norm": 1.1628297212132652, "kl": 0.02464178390800953, "learning_rate": 9.986806141396205e-07, "loss": 0.0074, "num_tokens": 9425220.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3627346754074097, "sampling/importance_sampling_ratio/mean": 1.0003234148025513, "sampling/importance_sampling_ratio/min": 0.6627572178840637, "sampling/sampling_logp_difference/max": 0.41134655475616455, "sampling/sampling_logp_difference/mean": 0.016353249549865723, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 199.421875, "completions/mean_terminated_length": 199.421875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.45876309275627136, "epoch": 0.36642156862745096, "frac_reward_zero_std": 0.75, "grad_norm": 0.9559940374465532, "kl": 0.02711273357272148, "learning_rate": 9.986283901574149e-07, "loss": 0.0228, "num_tokens": 9454127.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.8181120157241821, "sampling/importance_sampling_ratio/mean": 1.000248908996582, "sampling/importance_sampling_ratio/min": 0.6377933621406555, "sampling/sampling_logp_difference/max": 0.5977985858917236, "sampling/sampling_logp_difference/mean": 0.016439270228147507, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.446953684091568, "epoch": 0.36764705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.9389807093353264, "kl": 0.02266453579068184, "learning_rate": 9.985751539911664e-07, "loss": -0.0135, "num_tokens": 9485459.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5480645895004272, "sampling/importance_sampling_ratio/mean": 1.0001177787780762, "sampling/importance_sampling_ratio/min": 0.7234283089637756, "sampling/sampling_logp_difference/max": 0.43700551986694336, "sampling/sampling_logp_difference/mean": 0.015539245679974556, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 241.84375, "completions/mean_terminated_length": 241.84375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4896894693374634, "epoch": 0.36887254901960786, "frac_reward_zero_std": 0.5, "grad_norm": 1.2370543993976322, "kl": 0.017642877995967865, "learning_rate": 9.985209057489408e-07, "loss": 0.0249, "num_tokens": 9519673.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.608892560005188, "sampling/importance_sampling_ratio/mean": 1.0001373291015625, "sampling/importance_sampling_ratio/min": 0.6986686587333679, "sampling/sampling_logp_difference/max": 0.47554612159729004, "sampling/sampling_logp_difference/mean": 0.01651783287525177, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 209.046875, "completions/mean_terminated_length": 209.046875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.45735979080200195, "epoch": 0.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.933172548990412, "kl": 0.020198900252580643, "learning_rate": 9.98465645540859e-07, "loss": -0.0088, "num_tokens": 9549532.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.413565993309021, "sampling/importance_sampling_ratio/mean": 0.9999147057533264, "sampling/importance_sampling_ratio/min": 0.6831852793693542, "sampling/sampling_logp_difference/max": 0.3809892237186432, "sampling/sampling_logp_difference/mean": 0.01622258871793747, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 214.171875, "completions/mean_terminated_length": 214.171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.45174339413642883, "epoch": 0.3713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02551175006893155, "kl": 0.0194728821516037, "learning_rate": 9.984093734790954e-07, "loss": 0.0002, "num_tokens": 9583463.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.418564796447754, "sampling/importance_sampling_ratio/mean": 0.9997199773788452, "sampling/importance_sampling_ratio/min": 0.662344753742218, "sampling/sampling_logp_difference/max": 0.41196906566619873, "sampling/sampling_logp_difference/mean": 0.016074877232313156, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 216.8125, "completions/mean_terminated_length": 216.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4414713978767395, "epoch": 0.37254901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.3309676504186365, "kl": 0.015547841787338257, "learning_rate": 9.983520896778788e-07, "loss": 0.0445, "num_tokens": 9622219.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.639810562133789, "sampling/importance_sampling_ratio/mean": 1.0012476444244385, "sampling/importance_sampling_ratio/min": 0.49714693427085876, "sampling/sampling_logp_difference/max": 0.6988697052001953, "sampling/sampling_logp_difference/mean": 0.016059590503573418, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4350166916847229, "epoch": 0.3737745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.2038721713622835, "kl": 0.020988553762435913, "learning_rate": 9.982937942534917e-07, "loss": -0.0323, "num_tokens": 9650875.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5613172054290771, "sampling/importance_sampling_ratio/mean": 1.0004253387451172, "sampling/importance_sampling_ratio/min": 0.7304356694221497, "sampling/sampling_logp_difference/max": 0.4455298185348511, "sampling/sampling_logp_difference/mean": 0.01565871387720108, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 186.765625, "completions/mean_terminated_length": 186.765625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.36607080698013306, "epoch": 0.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.02285386951927824, "kl": 0.01576017215847969, "learning_rate": 9.982344873242701e-07, "loss": 0.0001, "num_tokens": 9678460.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003432035446167, "sampling/importance_sampling_ratio/min": 0.5685401558876038, "sampling/sampling_logp_difference/max": 0.7368893623352051, "sampling/sampling_logp_difference/mean": 0.015342392027378082, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 187.296875, "completions/mean_terminated_length": 187.296875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.36356499791145325, "epoch": 0.3762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.023063440600103766, "kl": 0.016244564205408096, "learning_rate": 9.981741690106034e-07, "loss": 0.0002, "num_tokens": 9709743.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8790457248687744, "sampling/importance_sampling_ratio/mean": 0.9994124174118042, "sampling/importance_sampling_ratio/min": 0.6109957695007324, "sampling/sampling_logp_difference/max": 0.6307640075683594, "sampling/sampling_logp_difference/mean": 0.015056891366839409, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 212.359375, "completions/mean_terminated_length": 212.359375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.32220736145973206, "epoch": 0.37745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.015985507237566882, "kl": 0.013575403019785881, "learning_rate": 9.981128394349337e-07, "loss": 0.0001, "num_tokens": 9740470.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4561007022857666, "sampling/importance_sampling_ratio/mean": 0.9995553493499756, "sampling/importance_sampling_ratio/min": 0.6257768273353577, "sampling/sampling_logp_difference/max": 0.4687614440917969, "sampling/sampling_logp_difference/mean": 0.01345759630203247, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 188.265625, "completions/mean_terminated_length": 188.265625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.37388908863067627, "epoch": 0.3786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02378861837544395, "kl": 0.018611183390021324, "learning_rate": 9.980504987217566e-07, "loss": 0.0002, "num_tokens": 9767383.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5393632650375366, "sampling/importance_sampling_ratio/mean": 0.9997674226760864, "sampling/importance_sampling_ratio/min": 0.48920756578445435, "sampling/sampling_logp_difference/max": 0.7149684429168701, "sampling/sampling_logp_difference/mean": 0.01607775315642357, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 225.015625, "completions/mean_terminated_length": 225.015625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4121472239494324, "epoch": 0.3799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.018721720571636468, "kl": 0.018510695546865463, "learning_rate": 9.979871469976195e-07, "loss": 0.0002, "num_tokens": 9803384.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5797837972640991, "sampling/importance_sampling_ratio/mean": 1.0007458925247192, "sampling/importance_sampling_ratio/min": 0.6805833578109741, "sampling/sampling_logp_difference/max": 0.4572880268096924, "sampling/sampling_logp_difference/mean": 0.015674971044063568, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 181.40625, "completions/mean_terminated_length": 181.40625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.42017343640327454, "epoch": 0.38112745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.02361696383934872, "kl": 0.017823034897446632, "learning_rate": 9.979227843911224e-07, "loss": 0.0002, "num_tokens": 9837234.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004793405532837, "sampling/importance_sampling_ratio/min": 0.6095881462097168, "sampling/sampling_logp_difference/max": 0.8857212066650391, "sampling/sampling_logp_difference/mean": 0.01524389162659645, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 222.171875, "completions/mean_terminated_length": 222.171875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3985661268234253, "epoch": 0.38235294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.019965078795185862, "kl": 0.01637699082493782, "learning_rate": 9.978574110329172e-07, "loss": 0.0002, "num_tokens": 9874541.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3650709390640259, "sampling/importance_sampling_ratio/mean": 1.000051736831665, "sampling/importance_sampling_ratio/min": 0.659122884273529, "sampling/sampling_logp_difference/max": 0.41684532165527344, "sampling/sampling_logp_difference/mean": 0.015085380524396896, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.37935322523117065, "epoch": 0.38357843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.01919782391360173, "kl": 0.01602853089570999, "learning_rate": 9.977910270557078e-07, "loss": 0.0002, "num_tokens": 9910253.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.44670832157135, "sampling/importance_sampling_ratio/mean": 0.9998260736465454, "sampling/importance_sampling_ratio/min": 0.6405990123748779, "sampling/sampling_logp_difference/max": 0.44535160064697266, "sampling/sampling_logp_difference/mean": 0.01509149931371212, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 184.734375, "completions/mean_terminated_length": 184.734375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.41767391562461853, "epoch": 0.38480392156862747, "frac_reward_zero_std": 1.0, "grad_norm": 0.023667493417852292, "kl": 0.02086430788040161, "learning_rate": 9.977236325942497e-07, "loss": 0.0002, "num_tokens": 9942540.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5250335931777954, "sampling/importance_sampling_ratio/mean": 0.9999814629554749, "sampling/importance_sampling_ratio/min": 0.6375662684440613, "sampling/sampling_logp_difference/max": 0.45009708404541016, "sampling/sampling_logp_difference/mean": 0.015893224626779556, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.431037575006485, "epoch": 0.3860294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.019758936015441354, "kl": 0.017636163160204887, "learning_rate": 9.97655227785349e-07, "loss": 0.0002, "num_tokens": 9973980.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6710313558578491, "sampling/importance_sampling_ratio/mean": 1.0001165866851807, "sampling/importance_sampling_ratio/min": 0.529052197933197, "sampling/sampling_logp_difference/max": 0.6366682052612305, "sampling/sampling_logp_difference/mean": 0.015019385144114494, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 200.671875, "completions/mean_terminated_length": 200.671875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.39899998903274536, "epoch": 0.3872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.021624545982917426, "kl": 0.018345296382904053, "learning_rate": 9.975858127678633e-07, "loss": 0.0002, "num_tokens": 10006103.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5327506065368652, "sampling/importance_sampling_ratio/mean": 0.9999449253082275, "sampling/importance_sampling_ratio/min": 0.5874239802360535, "sampling/sampling_logp_difference/max": 0.5320084095001221, "sampling/sampling_logp_difference/mean": 0.014733761548995972, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 207.515625, "completions/mean_terminated_length": 207.515625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.42619267106056213, "epoch": 0.38848039215686275, "frac_reward_zero_std": 0.75, "grad_norm": 0.7446097893106156, "kl": 0.021541431546211243, "learning_rate": 9.975153876827007e-07, "loss": 0.0084, "num_tokens": 10037224.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4943957328796387, "sampling/importance_sampling_ratio/mean": 1.000223159790039, "sampling/importance_sampling_ratio/min": 0.5525302290916443, "sampling/sampling_logp_difference/max": 0.5932471752166748, "sampling/sampling_logp_difference/mean": 0.01602860540151596, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2825194001197815, "epoch": 0.3897058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.022026282512501064, "kl": 0.016508134081959724, "learning_rate": 9.974439526728196e-07, "loss": 0.0002, "num_tokens": 10067248.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3455226421356201, "sampling/importance_sampling_ratio/mean": 0.9993760585784912, "sampling/importance_sampling_ratio/min": 0.6548749804496765, "sampling/sampling_logp_difference/max": 0.42331087589263916, "sampling/sampling_logp_difference/mean": 0.012481987476348877, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 184.34375, "completions/mean_terminated_length": 184.34375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.38290348649024963, "epoch": 0.3909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.02523643531495567, "kl": 0.019662799313664436, "learning_rate": 9.973715078832286e-07, "loss": 0.0002, "num_tokens": 10095030.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3655370473861694, "sampling/importance_sampling_ratio/mean": 1.0000557899475098, "sampling/importance_sampling_ratio/min": 0.7608779668807983, "sampling/sampling_logp_difference/max": 0.31154775619506836, "sampling/sampling_logp_difference/mean": 0.014857003465294838, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 174.203125, "completions/mean_terminated_length": 174.203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.333204448223114, "epoch": 0.39215686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.024831705119286045, "kl": 0.025300707668066025, "learning_rate": 9.97298053460986e-07, "loss": 0.0002, "num_tokens": 10125155.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.508152723312378, "sampling/importance_sampling_ratio/mean": 1.000302791595459, "sampling/importance_sampling_ratio/min": 0.3722725808620453, "sampling/sampling_logp_difference/max": 0.9881290197372437, "sampling/sampling_logp_difference/mean": 0.014936529099941254, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2664256691932678, "epoch": 0.39338235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.018332434252923264, "kl": 0.01409104559570551, "learning_rate": 9.972235895552e-07, "loss": 0.0001, "num_tokens": 10149203.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6294610500335693, "sampling/importance_sampling_ratio/mean": 1.0005340576171875, "sampling/importance_sampling_ratio/min": 0.744778037071228, "sampling/sampling_logp_difference/max": 0.4882493019104004, "sampling/sampling_logp_difference/mean": 0.013930534943938255, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 186.515625, "completions/mean_terminated_length": 186.515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.40586355328559875, "epoch": 0.3946078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.02064638541886519, "kl": 0.020959127694368362, "learning_rate": 9.971481163170269e-07, "loss": 0.0002, "num_tokens": 10182420.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4098576307296753, "sampling/importance_sampling_ratio/mean": 0.9997353553771973, "sampling/importance_sampling_ratio/min": 0.6208242774009705, "sampling/sampling_logp_difference/max": 0.47670722007751465, "sampling/sampling_logp_difference/mean": 0.015238940715789795, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3228031396865845, "epoch": 0.3958333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.024632588296384848, "kl": 0.018601976335048676, "learning_rate": 9.97071633899673e-07, "loss": 0.0002, "num_tokens": 10209444.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6561650037765503, "sampling/importance_sampling_ratio/mean": 1.0004572868347168, "sampling/importance_sampling_ratio/min": 0.014291869476437569, "sampling/sampling_logp_difference/max": 4.2480645179748535, "sampling/sampling_logp_difference/mean": 0.014869332313537598, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.35080116987228394, "epoch": 0.39705882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.024710164205598798, "kl": 0.018571950495243073, "learning_rate": 9.969941424583925e-07, "loss": 0.0002, "num_tokens": 10243040.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5731862783432007, "sampling/importance_sampling_ratio/mean": 1.0000193119049072, "sampling/importance_sampling_ratio/min": 0.6066892147064209, "sampling/sampling_logp_difference/max": 0.4997386932373047, "sampling/sampling_logp_difference/mean": 0.014886989258229733, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 165.265625, "completions/mean_terminated_length": 165.265625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.34618815779685974, "epoch": 0.39828431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.024235228506903773, "kl": 0.020537182688713074, "learning_rate": 9.969156421504887e-07, "loss": 0.0002, "num_tokens": 10272577.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9551204442977905, "sampling/importance_sampling_ratio/mean": 1.0000271797180176, "sampling/importance_sampling_ratio/min": 0.6173092722892761, "sampling/sampling_logp_difference/max": 0.6704518795013428, "sampling/sampling_logp_difference/mean": 0.016670338809490204, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 152.578125, "completions/mean_terminated_length": 152.578125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.34937143325805664, "epoch": 0.39950980392156865, "frac_reward_zero_std": 0.75, "grad_norm": 1.309998561306225, "kl": 0.036862075328826904, "learning_rate": 9.968361331353116e-07, "loss": 0.0184, "num_tokens": 10296006.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.338853359222412, "sampling/importance_sampling_ratio/mean": 1.0007468461990356, "sampling/importance_sampling_ratio/min": 0.4755912721157074, "sampling/sampling_logp_difference/max": 0.7431964874267578, "sampling/sampling_logp_difference/mean": 0.015171946957707405, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3997516632080078, "epoch": 0.4007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.03787665420392663, "kl": 0.02181834913790226, "learning_rate": 9.9675561557426e-07, "loss": 0.0002, "num_tokens": 10328042.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0001437664031982, "sampling/importance_sampling_ratio/min": 0.7130023837089539, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.016936684027314186, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 187.078125, "completions/mean_terminated_length": 187.078125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.385425329208374, "epoch": 0.4019607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.026178763304403684, "kl": 0.021473130211234093, "learning_rate": 9.966740896307791e-07, "loss": 0.0002, "num_tokens": 10359887.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3876692056655884, "sampling/importance_sampling_ratio/mean": 1.0007128715515137, "sampling/importance_sampling_ratio/min": 0.7448554039001465, "sampling/sampling_logp_difference/max": 0.3276255130767822, "sampling/sampling_logp_difference/mean": 0.013736354187130928, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 176.765625, "completions/mean_terminated_length": 176.765625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.30048856139183044, "epoch": 0.40318627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.021039406246671645, "kl": 0.017125394195318222, "learning_rate": 9.965915554703613e-07, "loss": 0.0002, "num_tokens": 10385312.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.470345139503479, "sampling/importance_sampling_ratio/mean": 0.9999263882637024, "sampling/importance_sampling_ratio/min": 0.6326866149902344, "sampling/sampling_logp_difference/max": 0.457780122756958, "sampling/sampling_logp_difference/mean": 0.0145049337297678, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3266918957233429, "epoch": 0.40441176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.022633059020672665, "kl": 0.015120496973395348, "learning_rate": 9.965080132605461e-07, "loss": 0.0001, "num_tokens": 10413112.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5924938917160034, "sampling/importance_sampling_ratio/mean": 1.0009623765945435, "sampling/importance_sampling_ratio/min": 0.6915599703788757, "sampling/sampling_logp_difference/max": 0.4653012752532959, "sampling/sampling_logp_difference/mean": 0.01513220090419054, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 191.328125, "completions/mean_terminated_length": 191.328125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.40041443705558777, "epoch": 0.4056372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.030299461024198816, "kl": 0.017881443724036217, "learning_rate": 9.964234631709185e-07, "loss": 0.0002, "num_tokens": 10445837.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4754209518432617, "sampling/importance_sampling_ratio/mean": 1.0003015995025635, "sampling/importance_sampling_ratio/min": 0.6172264218330383, "sampling/sampling_logp_difference/max": 0.48251938819885254, "sampling/sampling_logp_difference/mean": 0.016025379300117493, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.30025070905685425, "epoch": 0.4068627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.020141493915539384, "kl": 0.015826791524887085, "learning_rate": 9.963379053731102e-07, "loss": 0.0001, "num_tokens": 10474757.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.478416919708252, "sampling/importance_sampling_ratio/mean": 0.999915599822998, "sampling/importance_sampling_ratio/min": 0.6468645334243774, "sampling/sampling_logp_difference/max": 0.43561840057373047, "sampling/sampling_logp_difference/mean": 0.01432250440120697, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 157.40625, "completions/mean_terminated_length": 157.40625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.33026736974716187, "epoch": 0.40808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.022846751025488637, "kl": 0.01617838256061077, "learning_rate": 9.96251340040798e-07, "loss": 0.0002, "num_tokens": 10501903.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4032772779464722, "sampling/importance_sampling_ratio/mean": 1.000197172164917, "sampling/importance_sampling_ratio/min": 0.6706700921058655, "sampling/sampling_logp_difference/max": 0.3994779586791992, "sampling/sampling_logp_difference/mean": 0.015687666833400726, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.35955965518951416, "epoch": 0.40931372549019607, "frac_reward_zero_std": 1.0, "grad_norm": 0.027193936788723276, "kl": 0.012852571904659271, "learning_rate": 9.96163767349704e-07, "loss": 0.0001, "num_tokens": 10534649.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6321485042572021, "sampling/importance_sampling_ratio/mean": 0.9997122287750244, "sampling/importance_sampling_ratio/min": 0.553581714630127, "sampling/sampling_logp_difference/max": 0.5913459062576294, "sampling/sampling_logp_difference/mean": 0.01636279746890068, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 177.984375, "completions/mean_terminated_length": 177.984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3666573166847229, "epoch": 0.4105392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.027948584748888235, "kl": 0.015932895243167877, "learning_rate": 9.96075187477595e-07, "loss": 0.0002, "num_tokens": 10563736.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5915312767028809, "sampling/importance_sampling_ratio/mean": 1.0004807710647583, "sampling/importance_sampling_ratio/min": 0.7182236313819885, "sampling/sampling_logp_difference/max": 0.46469664573669434, "sampling/sampling_logp_difference/mean": 0.015503861010074615, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 152.984375, "completions/mean_terminated_length": 152.984375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.27830806374549866, "epoch": 0.4117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.03193201309027661, "kl": 0.014680283144116402, "learning_rate": 9.959856006042828e-07, "loss": 0.0001, "num_tokens": 10591527.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5989633798599243, "sampling/importance_sampling_ratio/mean": 1.0004345178604126, "sampling/importance_sampling_ratio/min": 0.6358773708343506, "sampling/sampling_logp_difference/max": 0.46935558319091797, "sampling/sampling_logp_difference/mean": 0.013306674547493458, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3773849606513977, "epoch": 0.41299019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.04655533106697762, "kl": 0.023683395236730576, "learning_rate": 9.95895006911623e-07, "loss": 0.0002, "num_tokens": 10624563.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5617190599441528, "sampling/importance_sampling_ratio/mean": 1.0000473260879517, "sampling/importance_sampling_ratio/min": 0.6068368554115295, "sampling/sampling_logp_difference/max": 0.499495267868042, "sampling/sampling_logp_difference/mean": 0.01680750772356987, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2574771046638489, "epoch": 0.41421568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.026002327792873828, "kl": 0.015414551831781864, "learning_rate": 9.95803406583515e-07, "loss": 0.0002, "num_tokens": 10648855.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5451250076293945, "sampling/importance_sampling_ratio/mean": 1.000146508216858, "sampling/importance_sampling_ratio/min": 0.6684028506278992, "sampling/sampling_logp_difference/max": 0.4351048469543457, "sampling/sampling_logp_difference/mean": 0.01264483667910099, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 154.65625, "completions/mean_terminated_length": 154.65625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3974250853061676, "epoch": 0.41544117647058826, "frac_reward_zero_std": 0.75, "grad_norm": 1.334713356953007, "kl": 0.03008582815527916, "learning_rate": 9.957107998059018e-07, "loss": -0.0329, "num_tokens": 10674385.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6329152584075928, "sampling/importance_sampling_ratio/mean": 1.0004063844680786, "sampling/importance_sampling_ratio/min": 0.6714426875114441, "sampling/sampling_logp_difference/max": 0.49036693572998047, "sampling/sampling_logp_difference/mean": 0.018222328275442123, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.38176703453063965, "epoch": 0.4166666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.018426472295113922, "kl": 0.014108864590525627, "learning_rate": 9.956171867667693e-07, "loss": 0.0001, "num_tokens": 10710585.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.820453405380249, "sampling/importance_sampling_ratio/mean": 0.9997619390487671, "sampling/importance_sampling_ratio/min": 0.5687109231948853, "sampling/sampling_logp_difference/max": 0.5990855693817139, "sampling/sampling_logp_difference/mean": 0.01673772558569908, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 120.515625, "completions/mean_terminated_length": 120.515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.320795476436615, "epoch": 0.4178921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.03431113829074164, "kl": 0.02109440788626671, "learning_rate": 9.955225676561459e-07, "loss": 0.0002, "num_tokens": 10731034.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4996675252914429, "sampling/importance_sampling_ratio/mean": 1.0003200769424438, "sampling/importance_sampling_ratio/min": 0.6079776287078857, "sampling/sampling_logp_difference/max": 0.497617244720459, "sampling/sampling_logp_difference/mean": 0.01737777516245842, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 175.828125, "completions/mean_terminated_length": 175.828125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2854253053665161, "epoch": 0.41911764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.021964472680569124, "kl": 0.012618260458111763, "learning_rate": 9.954269426661022e-07, "loss": 0.0001, "num_tokens": 10766447.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4091451168060303, "sampling/importance_sampling_ratio/mean": 0.999445378780365, "sampling/importance_sampling_ratio/min": 0.6348267197608948, "sampling/sampling_logp_difference/max": 0.4544031620025635, "sampling/sampling_logp_difference/mean": 0.01373360026627779, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3288782835006714, "epoch": 0.42034313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.020066321352772268, "kl": 0.012781183235347271, "learning_rate": 9.953303119907513e-07, "loss": 0.0001, "num_tokens": 10795631.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6213734149932861, "sampling/importance_sampling_ratio/mean": 0.9997131824493408, "sampling/importance_sampling_ratio/min": 0.711065948009491, "sampling/sampling_logp_difference/max": 0.4832735061645508, "sampling/sampling_logp_difference/mean": 0.013720612972974777, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 198.28125, "completions/mean_terminated_length": 198.28125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3548140525817871, "epoch": 0.4215686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.030670964298762787, "kl": 0.017154211178421974, "learning_rate": 9.952326758262472e-07, "loss": 0.0002, "num_tokens": 10827793.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.40550696849823, "sampling/importance_sampling_ratio/mean": 1.0001940727233887, "sampling/importance_sampling_ratio/min": 0.681526243686676, "sampling/sampling_logp_difference/max": 0.383420467376709, "sampling/sampling_logp_difference/mean": 0.013858755119144917, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.24323208630084991, "epoch": 0.4227941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.016511666124535206, "kl": 0.00972837209701538, "learning_rate": 9.95134034370785e-07, "loss": 0.0001, "num_tokens": 10852577.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6776573657989502, "sampling/importance_sampling_ratio/mean": 1.0005297660827637, "sampling/importance_sampling_ratio/min": 0.6209533214569092, "sampling/sampling_logp_difference/max": 0.5173983573913574, "sampling/sampling_logp_difference/mean": 0.01275689247995615, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3620036840438843, "epoch": 0.42401960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.025874137301171645, "kl": 0.01582196354866028, "learning_rate": 9.950343878246009e-07, "loss": 0.0001, "num_tokens": 10890145.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3052634000778198, "sampling/importance_sampling_ratio/mean": 1.0003299713134766, "sampling/importance_sampling_ratio/min": 0.6577809453010559, "sampling/sampling_logp_difference/max": 0.4188833236694336, "sampling/sampling_logp_difference/mean": 0.015653233975172043, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 186.359375, "completions/mean_terminated_length": 186.359375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.37454864382743835, "epoch": 0.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.027276294431496238, "kl": 0.01751563511788845, "learning_rate": 9.949337363899708e-07, "loss": 0.0002, "num_tokens": 10917944.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.496646761894226, "sampling/importance_sampling_ratio/mean": 1.0000839233398438, "sampling/importance_sampling_ratio/min": 0.6151504516601562, "sampling/sampling_logp_difference/max": 0.4858884811401367, "sampling/sampling_logp_difference/mean": 0.016659047454595566, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 191.21875, "completions/mean_terminated_length": 191.21875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.402404248714447, "epoch": 0.4264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.02102957159592185, "kl": 0.013128215447068214, "learning_rate": 9.948320802712107e-07, "loss": 0.0001, "num_tokens": 10945462.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2981833219528198, "sampling/importance_sampling_ratio/mean": 1.0000042915344238, "sampling/importance_sampling_ratio/min": 0.6452677845954895, "sampling/sampling_logp_difference/max": 0.43808984756469727, "sampling/sampling_logp_difference/mean": 0.01635223627090454, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.30062228441238403, "epoch": 0.42769607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.01587966621073377, "kl": 0.012505080550909042, "learning_rate": 9.947294196746762e-07, "loss": 0.0001, "num_tokens": 10977438.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6356579065322876, "sampling/importance_sampling_ratio/mean": 1.0002219676971436, "sampling/importance_sampling_ratio/min": 0.6478670239448547, "sampling/sampling_logp_difference/max": 0.49204516410827637, "sampling/sampling_logp_difference/mean": 0.01302589476108551, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 174.703125, "completions/mean_terminated_length": 174.703125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2981036305427551, "epoch": 0.42892156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.01907727053640788, "kl": 0.011256780475378036, "learning_rate": 9.946257548087619e-07, "loss": 0.0001, "num_tokens": 11004475.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4305928945541382, "sampling/importance_sampling_ratio/mean": 1.000324010848999, "sampling/importance_sampling_ratio/min": 0.6029837131500244, "sampling/sampling_logp_difference/max": 0.5058650970458984, "sampling/sampling_logp_difference/mean": 0.012895691208541393, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.29742923378944397, "epoch": 0.43014705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.013485970659333622, "kl": 0.009871533140540123, "learning_rate": 9.945210858839008e-07, "loss": 0.0001, "num_tokens": 11032547.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5784273147583008, "sampling/importance_sampling_ratio/mean": 0.9997193217277527, "sampling/importance_sampling_ratio/min": 0.6298893690109253, "sampling/sampling_logp_difference/max": 0.46221113204956055, "sampling/sampling_logp_difference/mean": 0.013206500560045242, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 253.390625, "completions/mean_terminated_length": 253.390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.33359986543655396, "epoch": 0.43137254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.01343310501689489, "kl": 0.00943258497864008, "learning_rate": 9.944154131125642e-07, "loss": 0.0001, "num_tokens": 11066620.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9546880722045898, "sampling/importance_sampling_ratio/mean": 1.0005977153778076, "sampling/importance_sampling_ratio/min": 0.6223419308662415, "sampling/sampling_logp_difference/max": 0.6702306270599365, "sampling/sampling_logp_difference/mean": 0.013398932293057442, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3880694508552551, "epoch": 0.4325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.012760186849020817, "kl": 0.009503064677119255, "learning_rate": 9.94308736709261e-07, "loss": 0.0001, "num_tokens": 11103452.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5074957609176636, "sampling/importance_sampling_ratio/mean": 1.0005195140838623, "sampling/importance_sampling_ratio/min": 0.6032688617706299, "sampling/sampling_logp_difference/max": 0.50539231300354, "sampling/sampling_logp_difference/mean": 0.01578535884618759, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 196.46875, "completions/mean_terminated_length": 196.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3384985625743866, "epoch": 0.4338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.018559612037949657, "kl": 0.011546864174306393, "learning_rate": 9.94201056890538e-07, "loss": 0.0001, "num_tokens": 11133242.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4395679235458374, "sampling/importance_sampling_ratio/mean": 1.0003654956817627, "sampling/importance_sampling_ratio/min": 0.703487753868103, "sampling/sampling_logp_difference/max": 0.36434292793273926, "sampling/sampling_logp_difference/mean": 0.015101935714483261, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3752257227897644, "epoch": 0.43504901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.020685644073571794, "kl": 0.012361589819192886, "learning_rate": 9.940923738749777e-07, "loss": 0.0001, "num_tokens": 11165290.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4209574460983276, "sampling/importance_sampling_ratio/mean": 0.999468207359314, "sampling/importance_sampling_ratio/min": 0.535356342792511, "sampling/sampling_logp_difference/max": 0.6248226165771484, "sampling/sampling_logp_difference/mean": 0.014410814270377159, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 216.15625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4338979423046112, "epoch": 0.4362745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.013385996902813196, "kl": 0.010194092988967896, "learning_rate": 9.939826878832003e-07, "loss": 0.0001, "num_tokens": 11194596.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5806500911712646, "sampling/importance_sampling_ratio/mean": 1.0002678632736206, "sampling/importance_sampling_ratio/min": 0.6268720626831055, "sampling/sampling_logp_difference/max": 0.467012882232666, "sampling/sampling_logp_difference/mean": 0.017618417739868164, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 221.03125, "completions/mean_terminated_length": 221.03125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.40737539529800415, "epoch": 0.4375, "frac_reward_zero_std": 1.0, "grad_norm": 0.01643896331280935, "kl": 0.01152168121188879, "learning_rate": 9.938719991378613e-07, "loss": 0.0001, "num_tokens": 11230726.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5277706384658813, "sampling/importance_sampling_ratio/mean": 1.0003275871276855, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.4238095283508301, "sampling/sampling_logp_difference/mean": 0.015043385326862335, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.38174504041671753, "epoch": 0.4387254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.013168161127015283, "kl": 0.008941764943301678, "learning_rate": 9.937603078636518e-07, "loss": 0.0001, "num_tokens": 11271110.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.675676941871643, "sampling/importance_sampling_ratio/mean": 1.0006225109100342, "sampling/importance_sampling_ratio/min": 0.5020617842674255, "sampling/sampling_logp_difference/max": 0.6890320777893066, "sampling/sampling_logp_difference/mean": 0.016321398317813873, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 160.40625, "completions/mean_terminated_length": 160.40625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3537214398384094, "epoch": 0.43995098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.018349232248083134, "kl": 0.014196186326444149, "learning_rate": 9.936476142872977e-07, "loss": 0.0001, "num_tokens": 11294928.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3459526300430298, "sampling/importance_sampling_ratio/mean": 1.0004903078079224, "sampling/importance_sampling_ratio/min": 0.6110498309135437, "sampling/sampling_logp_difference/max": 0.49257683753967285, "sampling/sampling_logp_difference/mean": 0.01667492464184761, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 200.3125, "completions/mean_terminated_length": 200.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.42494967579841614, "epoch": 0.4411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018074421995676333, "kl": 0.012515506707131863, "learning_rate": 9.935339186375603e-07, "loss": 0.0001, "num_tokens": 11328580.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4254281520843506, "sampling/importance_sampling_ratio/mean": 0.9997589588165283, "sampling/importance_sampling_ratio/min": 0.6549595594406128, "sampling/sampling_logp_difference/max": 0.42318177223205566, "sampling/sampling_logp_difference/mean": 0.017285365611314774, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 278.140625, "completions/mean_terminated_length": 278.140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.41641536355018616, "epoch": 0.4424019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.009015352703555118, "kl": 0.006647173315286636, "learning_rate": 9.934192211452344e-07, "loss": 0.0001, "num_tokens": 11372749.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4884394407272339, "sampling/importance_sampling_ratio/mean": 0.9997484087944031, "sampling/importance_sampling_ratio/min": 0.629547655582428, "sampling/sampling_logp_difference/max": 0.4627537727355957, "sampling/sampling_logp_difference/mean": 0.01442030444741249, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 247.359375, "completions/mean_terminated_length": 247.359375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.35651469230651855, "epoch": 0.44362745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.011251895085230473, "kl": 0.008346650749444962, "learning_rate": 9.933035220431487e-07, "loss": 0.0001, "num_tokens": 11406612.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.597931146621704, "sampling/importance_sampling_ratio/mean": 1.0006805658340454, "sampling/importance_sampling_ratio/min": 0.6154104471206665, "sampling/sampling_logp_difference/max": 0.48546576499938965, "sampling/sampling_logp_difference/mean": 0.013788014650344849, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 199.109375, "completions/mean_terminated_length": 199.109375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4216683506965637, "epoch": 0.44485294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.017696291196703195, "kl": 0.011852515861392021, "learning_rate": 9.931868215661647e-07, "loss": 0.0001, "num_tokens": 11435675.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5903968811035156, "sampling/importance_sampling_ratio/mean": 0.9999369978904724, "sampling/importance_sampling_ratio/min": 0.6195716857910156, "sampling/sampling_logp_difference/max": 0.478726863861084, "sampling/sampling_logp_difference/mean": 0.01574837602674961, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 199.15625, "completions/mean_terminated_length": 199.15625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3079066276550293, "epoch": 0.44607843137254904, "frac_reward_zero_std": 1.0, "grad_norm": 0.014264489431204608, "kl": 0.008546917699277401, "learning_rate": 9.930691199511773e-07, "loss": 0.0001, "num_tokens": 11461461.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.596198558807373, "sampling/importance_sampling_ratio/mean": 0.999616801738739, "sampling/importance_sampling_ratio/min": 0.6264073848724365, "sampling/sampling_logp_difference/max": 0.4677543640136719, "sampling/sampling_logp_difference/mean": 0.01515682227909565, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 211.71875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3385094106197357, "epoch": 0.44730392156862747, "frac_reward_zero_std": 1.0, "grad_norm": 0.01982976435718639, "kl": 0.011798565275967121, "learning_rate": 9.929504174371136e-07, "loss": 0.0001, "num_tokens": 11492899.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.608918309211731, "sampling/importance_sampling_ratio/mean": 0.9996719360351562, "sampling/importance_sampling_ratio/min": 0.6418868899345398, "sampling/sampling_logp_difference/max": 0.47556209564208984, "sampling/sampling_logp_difference/mean": 0.013919688761234283, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35065576434135437, "epoch": 0.4485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01633451405040199, "kl": 0.010045424103736877, "learning_rate": 9.928307142649314e-07, "loss": 0.0001, "num_tokens": 11517439.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5628693103790283, "sampling/importance_sampling_ratio/mean": 1.0001769065856934, "sampling/importance_sampling_ratio/min": 0.6255244612693787, "sampling/sampling_logp_difference/max": 0.4691648483276367, "sampling/sampling_logp_difference/mean": 0.01635180599987507, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 177.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.33547794818878174, "epoch": 0.4497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.019456791341792457, "kl": 0.012170180678367615, "learning_rate": 9.927100106776212e-07, "loss": 0.0001, "num_tokens": 11543607.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5203474760055542, "sampling/importance_sampling_ratio/mean": 0.9998880624771118, "sampling/importance_sampling_ratio/min": 0.6530148983001709, "sampling/sampling_logp_difference/max": 0.42615532875061035, "sampling/sampling_logp_difference/mean": 0.014844020828604698, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 172.515625, "completions/mean_terminated_length": 172.515625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4139445126056671, "epoch": 0.45098039215686275, "frac_reward_zero_std": 1.0, "grad_norm": 0.01405004602057797, "kl": 0.010405349545180798, "learning_rate": 9.925883069202034e-07, "loss": 0.0001, "num_tokens": 11573224.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3822776079177856, "sampling/importance_sampling_ratio/mean": 0.9990941286087036, "sampling/importance_sampling_ratio/min": 0.6073088645935059, "sampling/sampling_logp_difference/max": 0.4987177848815918, "sampling/sampling_logp_difference/mean": 0.01969890296459198, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.419656366109848, "epoch": 0.4522058823529412, "frac_reward_zero_std": 0.75, "grad_norm": 0.9301273362582549, "kl": 0.010022681206464767, "learning_rate": 9.92465603239729e-07, "loss": -0.0052, "num_tokens": 11606044.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4288361072540283, "sampling/importance_sampling_ratio/mean": 0.9995024800300598, "sampling/importance_sampling_ratio/min": 0.6839989423751831, "sampling/sampling_logp_difference/max": 0.37979888916015625, "sampling/sampling_logp_difference/mean": 0.01661648415029049, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3726652264595032, "epoch": 0.4534313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.013424665837497438, "kl": 0.009182492271065712, "learning_rate": 9.923418998852787e-07, "loss": 0.0001, "num_tokens": 11632482.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4375884532928467, "sampling/importance_sampling_ratio/mean": 1.0001118183135986, "sampling/importance_sampling_ratio/min": 0.5601235032081604, "sampling/sampling_logp_difference/max": 0.5795979499816895, "sampling/sampling_logp_difference/mean": 0.017543703317642212, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 192.09375, "completions/mean_terminated_length": 192.09375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.35122162103652954, "epoch": 0.45465686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.015861282409031972, "kl": 0.009136844426393509, "learning_rate": 9.922171971079622e-07, "loss": 0.0001, "num_tokens": 11661432.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.329856276512146, "sampling/importance_sampling_ratio/mean": 1.0001051425933838, "sampling/importance_sampling_ratio/min": 0.6267584562301636, "sampling/sampling_logp_difference/max": 0.4671940803527832, "sampling/sampling_logp_difference/mean": 0.01532377116382122, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 187.515625, "completions/mean_terminated_length": 187.515625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3589586019515991, "epoch": 0.45588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.018064289896045117, "kl": 0.010359864681959152, "learning_rate": 9.920914951609186e-07, "loss": 0.0001, "num_tokens": 11691545.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.505644679069519, "sampling/importance_sampling_ratio/mean": 0.9999890327453613, "sampling/importance_sampling_ratio/min": 0.6101424098014832, "sampling/sampling_logp_difference/max": 0.4940629005432129, "sampling/sampling_logp_difference/mean": 0.015188705176115036, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 171.90625, "completions/mean_terminated_length": 171.90625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.33146971464157104, "epoch": 0.4571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01367888723644528, "kl": 0.00988725759088993, "learning_rate": 9.919647942993147e-07, "loss": 0.0001, "num_tokens": 11720611.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3563395738601685, "sampling/importance_sampling_ratio/mean": 1.0000863075256348, "sampling/importance_sampling_ratio/min": 0.5471782684326172, "sampling/sampling_logp_difference/max": 0.6029806137084961, "sampling/sampling_logp_difference/mean": 0.016517682000994682, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 229.578125, "completions/mean_terminated_length": 229.578125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3728431463241577, "epoch": 0.4583333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 1.181335991355015, "kl": 0.008848130702972412, "learning_rate": 9.918370947803455e-07, "loss": -0.0115, "num_tokens": 11758824.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4291181564331055, "sampling/importance_sampling_ratio/mean": 0.9998779296875, "sampling/importance_sampling_ratio/min": 0.6423242688179016, "sampling/sampling_logp_difference/max": 0.44266200065612793, "sampling/sampling_logp_difference/mean": 0.014619621448218822, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 221.59375, "completions/mean_terminated_length": 221.59375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.297640323638916, "epoch": 0.45955882352941174, "frac_reward_zero_std": 1.0, "grad_norm": 0.010795928871486645, "kl": 0.008283271454274654, "learning_rate": 9.917083968632326e-07, "loss": 0.0001, "num_tokens": 11788430.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.507724642753601, "sampling/importance_sampling_ratio/mean": 0.9998579025268555, "sampling/importance_sampling_ratio/min": 0.6445702910423279, "sampling/sampling_logp_difference/max": 0.4391714334487915, "sampling/sampling_logp_difference/mean": 0.013588126748800278, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 212.171875, "completions/mean_terminated_length": 212.171875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.41798919439315796, "epoch": 0.46078431372549017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0345409906844456, "kl": 0.016850750893354416, "learning_rate": 9.915787008092246e-07, "loss": 0.0002, "num_tokens": 11824425.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6266127824783325, "sampling/importance_sampling_ratio/mean": 1.0001428127288818, "sampling/importance_sampling_ratio/min": 0.541965663433075, "sampling/sampling_logp_difference/max": 0.6125526428222656, "sampling/sampling_logp_difference/mean": 0.016644926741719246, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 159.953125, "completions/mean_terminated_length": 159.953125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3381967842578888, "epoch": 0.46200980392156865, "frac_reward_zero_std": 1.0, "grad_norm": 0.015688429278366736, "kl": 0.010663250461220741, "learning_rate": 9.914480068815961e-07, "loss": 0.0001, "num_tokens": 11854566.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0000499486923218, "sampling/importance_sampling_ratio/min": 0.5363562703132629, "sampling/sampling_logp_difference/max": 0.6229566335678101, "sampling/sampling_logp_difference/mean": 0.015087351202964783, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.313782274723053, "epoch": 0.4632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.012745304946639297, "kl": 0.00800234079360962, "learning_rate": 9.913163153456482e-07, "loss": 0.0001, "num_tokens": 11881374.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6307555437088013, "sampling/importance_sampling_ratio/mean": 1.00034761428833, "sampling/importance_sampling_ratio/min": 0.6225945353507996, "sampling/sampling_logp_difference/max": 0.4890434741973877, "sampling/sampling_logp_difference/mean": 0.014235056936740875, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 249.546875, "completions/mean_terminated_length": 249.546875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.32097193598747253, "epoch": 0.4644607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.009852273938805821, "kl": 0.005863037426024675, "learning_rate": 9.91183626468706e-07, "loss": 0.0001, "num_tokens": 11916305.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.434583306312561, "sampling/importance_sampling_ratio/mean": 1.0002024173736572, "sampling/importance_sampling_ratio/min": 0.6957331895828247, "sampling/sampling_logp_difference/max": 0.3627890348434448, "sampling/sampling_logp_difference/mean": 0.01312682218849659, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 248.1875, "completions/mean_terminated_length": 248.1875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4050544798374176, "epoch": 0.46568627450980393, "frac_reward_zero_std": 1.0, "grad_norm": 0.016587628782540324, "kl": 0.008186952210962772, "learning_rate": 9.910499405201193e-07, "loss": 0.0001, "num_tokens": 11951053.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4755452871322632, "sampling/importance_sampling_ratio/mean": 1.0002648830413818, "sampling/importance_sampling_ratio/min": 0.6298418641090393, "sampling/sampling_logp_difference/max": 0.46228647232055664, "sampling/sampling_logp_difference/mean": 0.01567826420068741, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4453149139881134, "epoch": 0.46691176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.8653692004783323, "kl": 0.01589384488761425, "learning_rate": 9.909152577712625e-07, "loss": -0.0128, "num_tokens": 11977769.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3567479848861694, "sampling/importance_sampling_ratio/mean": 0.9998579025268555, "sampling/importance_sampling_ratio/min": 0.6224786043167114, "sampling/sampling_logp_difference/max": 0.474045991897583, "sampling/sampling_logp_difference/mean": 0.019120272248983383, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24300457537174225, "epoch": 0.4681372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.01401523700410047, "kl": 0.00933674443513155, "learning_rate": 9.907795784955326e-07, "loss": 0.0001, "num_tokens": 12008637.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.404148817062378, "sampling/importance_sampling_ratio/mean": 0.9994839429855347, "sampling/importance_sampling_ratio/min": 0.7045887112617493, "sampling/sampling_logp_difference/max": 0.3501410186290741, "sampling/sampling_logp_difference/mean": 0.011331282556056976, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 246.21875, "completions/mean_terminated_length": 246.21875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3826541304588318, "epoch": 0.4693627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.8061377842091467, "kl": 0.009127024561166763, "learning_rate": 9.906429029683504e-07, "loss": 0.0324, "num_tokens": 12041547.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3910638093948364, "sampling/importance_sampling_ratio/mean": 0.9997413158416748, "sampling/importance_sampling_ratio/min": 0.6038206815719604, "sampling/sampling_logp_difference/max": 0.5044779777526855, "sampling/sampling_logp_difference/mean": 0.014083616435527802, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 221.765625, "completions/mean_terminated_length": 221.765625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3471091687679291, "epoch": 0.47058823529411764, "frac_reward_zero_std": 0.75, "grad_norm": 0.8550076145586865, "kl": 0.010176371783018112, "learning_rate": 9.90505231467158e-07, "loss": -0.0033, "num_tokens": 12078172.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5535870790481567, "sampling/importance_sampling_ratio/mean": 0.9999591708183289, "sampling/importance_sampling_ratio/min": 0.6087507009506226, "sampling/sampling_logp_difference/max": 0.49634647369384766, "sampling/sampling_logp_difference/mean": 0.014170932583510876, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 243.6875, "completions/mean_terminated_length": 243.6875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.32598116993904114, "epoch": 0.47181372549019607, "frac_reward_zero_std": 1.0, "grad_norm": 0.017329786007415797, "kl": 0.01097575668245554, "learning_rate": 9.903665642714204e-07, "loss": 0.0001, "num_tokens": 12111400.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.510948896408081, "sampling/importance_sampling_ratio/mean": 0.9996094703674316, "sampling/importance_sampling_ratio/min": 0.5932686924934387, "sampling/sampling_logp_difference/max": 0.5221078395843506, "sampling/sampling_logp_difference/mean": 0.012972252443432808, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 221.65625, "completions/mean_terminated_length": 221.65625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3300212621688843, "epoch": 0.4730392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.012453816009873477, "kl": 0.008042341098189354, "learning_rate": 9.90226901662623e-07, "loss": 0.0001, "num_tokens": 12141218.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5358397960662842, "sampling/importance_sampling_ratio/mean": 1.0001249313354492, "sampling/importance_sampling_ratio/min": 0.2011961191892624, "sampling/sampling_logp_difference/max": 1.6034750938415527, "sampling/sampling_logp_difference/mean": 0.017671309411525726, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 214.640625, "completions/mean_terminated_length": 214.640625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35195472836494446, "epoch": 0.4742647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.016225673081601708, "kl": 0.009677985683083534, "learning_rate": 9.900862439242718e-07, "loss": 0.0001, "num_tokens": 12172651.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4994847774505615, "sampling/importance_sampling_ratio/mean": 0.9996469616889954, "sampling/importance_sampling_ratio/min": 0.5029210448265076, "sampling/sampling_logp_difference/max": 0.6873221397399902, "sampling/sampling_logp_difference/mean": 0.015036087483167648, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.39445775747299194, "epoch": 0.47549019607843135, "frac_reward_zero_std": 1.0, "grad_norm": 0.01883372708158617, "kl": 0.009737227112054825, "learning_rate": 9.899445913418935e-07, "loss": 0.0001, "num_tokens": 12207075.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.554342269897461, "sampling/importance_sampling_ratio/mean": 1.0000633001327515, "sampling/importance_sampling_ratio/min": 0.6536708474159241, "sampling/sampling_logp_difference/max": 0.4410524368286133, "sampling/sampling_logp_difference/mean": 0.015788394957780838, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 213.484375, "completions/mean_terminated_length": 213.484375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.31441351771354675, "epoch": 0.47671568627450983, "frac_reward_zero_std": 1.0, "grad_norm": 0.02090207683091764, "kl": 0.010597823187708855, "learning_rate": 9.898019442030337e-07, "loss": 0.0001, "num_tokens": 12235378.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.9693632125854492, "sampling/importance_sampling_ratio/mean": 1.0002236366271973, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.6777102947235107, "sampling/sampling_logp_difference/mean": 0.015006947331130505, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.43768465518951416, "epoch": 0.47794117647058826, "frac_reward_zero_std": 1.0, "grad_norm": 0.022032048994189212, "kl": 0.01550086960196495, "learning_rate": 9.89658302797257e-07, "loss": 0.0001, "num_tokens": 12260194.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5772755146026611, "sampling/importance_sampling_ratio/mean": 1.0005762577056885, "sampling/importance_sampling_ratio/min": 0.6577732563018799, "sampling/sampling_logp_difference/max": 0.45569896697998047, "sampling/sampling_logp_difference/mean": 0.018796630203723907, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 192.140625, "completions/mean_terminated_length": 192.140625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.340323269367218, "epoch": 0.4791666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.01645434437036542, "kl": 0.009912341833114624, "learning_rate": 9.895136674161464e-07, "loss": 0.0001, "num_tokens": 12289787.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6626406908035278, "sampling/importance_sampling_ratio/mean": 1.0005561113357544, "sampling/importance_sampling_ratio/min": 0.7020365595817566, "sampling/sampling_logp_difference/max": 0.5084071159362793, "sampling/sampling_logp_difference/mean": 0.01474553719162941, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 213.734375, "completions/mean_terminated_length": 213.734375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.48565277457237244, "epoch": 0.4803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.0367646587482953, "kl": 0.014473862014710903, "learning_rate": 9.893680383533024e-07, "loss": -0.002, "num_tokens": 12323146.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 0.9999999403953552, "sampling/importance_sampling_ratio/min": 0.6321954727172852, "sampling/sampling_logp_difference/max": 0.4585566520690918, "sampling/sampling_logp_difference/mean": 0.019393594935536385, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 237.09375, "completions/mean_terminated_length": 237.09375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3610077202320099, "epoch": 0.48161764705882354, "frac_reward_zero_std": 1.0, "grad_norm": 0.023214835044770902, "kl": 0.011279666796326637, "learning_rate": 9.892214159043433e-07, "loss": 0.0001, "num_tokens": 12358272.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5280091762542725, "sampling/importance_sampling_ratio/mean": 1.00017249584198, "sampling/importance_sampling_ratio/min": 0.6086266040802002, "sampling/sampling_logp_difference/max": 0.4965503215789795, "sampling/sampling_logp_difference/mean": 0.01587025076150894, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 172.328125, "completions/mean_terminated_length": 172.328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3241961598396301, "epoch": 0.48284313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.027468222552452255, "kl": 0.011704692617058754, "learning_rate": 9.890738003669027e-07, "loss": 0.0001, "num_tokens": 12384885.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.405652642250061, "sampling/importance_sampling_ratio/mean": 0.9996992349624634, "sampling/importance_sampling_ratio/min": 0.6394681334495544, "sampling/sampling_logp_difference/max": 0.44711852073669434, "sampling/sampling_logp_difference/mean": 0.015274315141141415, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.37035873532295227, "epoch": 0.4840686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.017759354131219995, "kl": 0.011212404817342758, "learning_rate": 9.889251920406312e-07, "loss": 0.0001, "num_tokens": 12413861.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4374908208847046, "sampling/importance_sampling_ratio/mean": 0.9999295473098755, "sampling/importance_sampling_ratio/min": 0.5719456076622009, "sampling/sampling_logp_difference/max": 0.5587114095687866, "sampling/sampling_logp_difference/mean": 0.015050732530653477, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2652778625488281, "epoch": 0.4852941176470588, "frac_reward_zero_std": 1.0, "grad_norm": 0.01741930717185592, "kl": 0.008691548369824886, "learning_rate": 9.887755912271942e-07, "loss": 0.0001, "num_tokens": 12446029.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.372191071510315, "sampling/importance_sampling_ratio/mean": 0.9995167255401611, "sampling/importance_sampling_ratio/min": 0.6255383491516113, "sampling/sampling_logp_difference/max": 0.4691426753997803, "sampling/sampling_logp_difference/mean": 0.012035621330142021, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 216.15625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.33375227451324463, "epoch": 0.48651960784313725, "frac_reward_zero_std": 1.0, "grad_norm": 0.02535238968033679, "kl": 0.011992005631327629, "learning_rate": 9.886249982302718e-07, "loss": 0.0001, "num_tokens": 12477911.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.620891809463501, "sampling/importance_sampling_ratio/mean": 0.9997732639312744, "sampling/importance_sampling_ratio/min": 0.6178363561630249, "sampling/sampling_logp_difference/max": 0.48297643661499023, "sampling/sampling_logp_difference/mean": 0.014461209997534752, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3546869158744812, "epoch": 0.4877450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8350613153136783, "kl": 0.016289301216602325, "learning_rate": 9.884734133555585e-07, "loss": -0.0084, "num_tokens": 12509975.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8640707731246948, "sampling/importance_sampling_ratio/mean": 1.000771164894104, "sampling/importance_sampling_ratio/min": 0.6371414065361023, "sampling/sampling_logp_difference/max": 0.6227626800537109, "sampling/sampling_logp_difference/mean": 0.015145364217460155, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 217.359375, "completions/mean_terminated_length": 217.359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3628026247024536, "epoch": 0.4889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0230960872634931, "kl": 0.010546230711042881, "learning_rate": 9.883208369107617e-07, "loss": 0.0001, "num_tokens": 12539774.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4766346216201782, "sampling/importance_sampling_ratio/mean": 0.9998975396156311, "sampling/importance_sampling_ratio/min": 0.5483725070953369, "sampling/sampling_logp_difference/max": 0.6008005142211914, "sampling/sampling_logp_difference/mean": 0.015969838947057724, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3053513169288635, "epoch": 0.49019607843137253, "frac_reward_zero_std": 1.0, "grad_norm": 0.040026206678680114, "kl": 0.01512863114476204, "learning_rate": 9.88167269205602e-07, "loss": 0.0001, "num_tokens": 12564590.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4661747217178345, "sampling/importance_sampling_ratio/mean": 0.9999059438705444, "sampling/importance_sampling_ratio/min": 0.6298375129699707, "sampling/sampling_logp_difference/max": 0.4622933864593506, "sampling/sampling_logp_difference/mean": 0.01319966372102499, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 231.4375, "completions/mean_terminated_length": 231.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.338267982006073, "epoch": 0.49142156862745096, "frac_reward_zero_std": 1.0, "grad_norm": 0.01592680116951382, "kl": 0.009573228657245636, "learning_rate": 9.880127105518122e-07, "loss": 0.0001, "num_tokens": 12597114.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4357717037200928, "sampling/importance_sampling_ratio/mean": 0.999531626701355, "sampling/importance_sampling_ratio/min": 0.45780494809150696, "sampling/sampling_logp_difference/max": 0.781312108039856, "sampling/sampling_logp_difference/mean": 0.01465204730629921, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 177.46875, "completions/mean_terminated_length": 177.46875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.42952173948287964, "epoch": 0.49264705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 1.0164482068868528, "kl": 0.01893451064825058, "learning_rate": 9.878571612631363e-07, "loss": -0.013, "num_tokens": 12623736.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5284698009490967, "sampling/importance_sampling_ratio/mean": 1.0002241134643555, "sampling/importance_sampling_ratio/min": 0.601417601108551, "sampling/sampling_logp_difference/max": 0.5084657669067383, "sampling/sampling_logp_difference/mean": 0.019010312855243683, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 192.96875, "completions/mean_terminated_length": 192.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3650784492492676, "epoch": 0.49387254901960786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0365249853735248, "kl": 0.0104802455753088, "learning_rate": 9.8770062165533e-07, "loss": 0.0001, "num_tokens": 12658470.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5172513723373413, "sampling/importance_sampling_ratio/mean": 0.999741792678833, "sampling/importance_sampling_ratio/min": 0.6829821467399597, "sampling/sampling_logp_difference/max": 0.4169003963470459, "sampling/sampling_logp_difference/mean": 0.015502199530601501, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 161.78125, "completions/mean_terminated_length": 161.78125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35916945338249207, "epoch": 0.4950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.10011804646684393, "kl": 0.02148270606994629, "learning_rate": 9.875430920461583e-07, "loss": 0.0002, "num_tokens": 12686120.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.454472541809082, "sampling/importance_sampling_ratio/mean": 0.9993026256561279, "sampling/importance_sampling_ratio/min": 0.47207364439964294, "sampling/sampling_logp_difference/max": 0.7506203651428223, "sampling/sampling_logp_difference/mean": 0.016054194420576096, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.31307125091552734, "epoch": 0.4963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03287256451829378, "kl": 0.01108582690358162, "learning_rate": 9.873845727553965e-07, "loss": 0.0001, "num_tokens": 12720904.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.479647159576416, "sampling/importance_sampling_ratio/mean": 0.9998552203178406, "sampling/importance_sampling_ratio/min": 0.6676144599914551, "sampling/sampling_logp_difference/max": 0.40404438972473145, "sampling/sampling_logp_difference/mean": 0.013567068614065647, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 181.296875, "completions/mean_terminated_length": 181.296875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4192981421947479, "epoch": 0.49754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.022305365019105922, "kl": 0.013657962903380394, "learning_rate": 9.87225064104829e-07, "loss": 0.0001, "num_tokens": 12747435.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4031422138214111, "sampling/importance_sampling_ratio/mean": 0.9997045397758484, "sampling/importance_sampling_ratio/min": 0.6355483531951904, "sampling/sampling_logp_difference/max": 0.45326709747314453, "sampling/sampling_logp_difference/mean": 0.01757827028632164, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 226.03125, "completions/mean_terminated_length": 226.03125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.30763596296310425, "epoch": 0.4987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.023753100882734985, "kl": 0.01026175543665886, "learning_rate": 9.870645664182476e-07, "loss": 0.0001, "num_tokens": 12778301.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5588346719741821, "sampling/importance_sampling_ratio/mean": 0.9999215602874756, "sampling/importance_sampling_ratio/min": 0.6088090538978577, "sampling/sampling_logp_difference/max": 0.49625062942504883, "sampling/sampling_logp_difference/mean": 0.013904592022299767, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 170.84375, "completions/mean_terminated_length": 170.84375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3431023955345154, "epoch": 0.5, "frac_reward_zero_std": 1.0, "grad_norm": 0.024057986878457543, "kl": 0.013347751460969448, "learning_rate": 9.86903080021453e-07, "loss": 0.0001, "num_tokens": 12807619.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5784333944320679, "sampling/importance_sampling_ratio/mean": 0.999455988407135, "sampling/importance_sampling_ratio/min": 0.5910114645957947, "sampling/sampling_logp_difference/max": 0.5259199142456055, "sampling/sampling_logp_difference/mean": 0.01554157119244337, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 241.53125, "completions/mean_terminated_length": 241.53125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.35906076431274414, "epoch": 0.5012254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.014658573627134372, "kl": 0.009751342236995697, "learning_rate": 9.867406052422523e-07, "loss": 0.0001, "num_tokens": 12845237.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9751145839691162, "sampling/importance_sampling_ratio/mean": 1.0009266138076782, "sampling/importance_sampling_ratio/min": 0.6303215622901917, "sampling/sampling_logp_difference/max": 0.680626392364502, "sampling/sampling_logp_difference/mean": 0.014839326031506062, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 253.65625, "completions/mean_terminated_length": 253.65625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.37862318754196167, "epoch": 0.5024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.7770336075388821, "kl": 0.011146817356348038, "learning_rate": 9.865771424104587e-07, "loss": -0.0127, "num_tokens": 12878559.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.475413203239441, "sampling/importance_sampling_ratio/mean": 0.9998399019241333, "sampling/importance_sampling_ratio/min": 0.6268531680107117, "sampling/sampling_logp_difference/max": 0.4670429229736328, "sampling/sampling_logp_difference/mean": 0.014631716534495354, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 251.28125, "completions/mean_terminated_length": 251.28125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.42295581102371216, "epoch": 0.5036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5690485828250321, "kl": 0.010090619325637817, "learning_rate": 9.864126918578919e-07, "loss": -0.0016, "num_tokens": 12912721.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2837777137756348, "sampling/importance_sampling_ratio/mean": 1.000241994857788, "sampling/importance_sampling_ratio/min": 0.5151903033256531, "sampling/sampling_logp_difference/max": 0.6632189750671387, "sampling/sampling_logp_difference/mean": 0.015532649122178555, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 175.734375, "completions/mean_terminated_length": 175.734375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.41181814670562744, "epoch": 0.5049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.01816848416533495, "kl": 0.01359694916754961, "learning_rate": 9.862472539183755e-07, "loss": 0.0001, "num_tokens": 12939456.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5215433835983276, "sampling/importance_sampling_ratio/mean": 1.0003886222839355, "sampling/importance_sampling_ratio/min": 0.5691208839416504, "sampling/sampling_logp_difference/max": 0.5636624097824097, "sampling/sampling_logp_difference/mean": 0.017363913357257843, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3904797434806824, "epoch": 0.5061274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.018690357847311847, "kl": 0.013147925958037376, "learning_rate": 9.860808289277385e-07, "loss": 0.0001, "num_tokens": 12967208.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4828284978866577, "sampling/importance_sampling_ratio/mean": 1.0003409385681152, "sampling/importance_sampling_ratio/min": 0.6065059304237366, "sampling/sampling_logp_difference/max": 0.5000407695770264, "sampling/sampling_logp_difference/mean": 0.01624198630452156, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 195.1875, "completions/mean_terminated_length": 195.1875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.30729755759239197, "epoch": 0.5073529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.0649258163660247, "kl": 0.010252762585878372, "learning_rate": 9.859134172238128e-07, "loss": -0.0148, "num_tokens": 12994340.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.366351842880249, "sampling/importance_sampling_ratio/mean": 1.000044345855713, "sampling/importance_sampling_ratio/min": 0.6018763184547424, "sampling/sampling_logp_difference/max": 0.5077033042907715, "sampling/sampling_logp_difference/mean": 0.012773063965141773, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 182.234375, "completions/mean_terminated_length": 182.234375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.32861125469207764, "epoch": 0.508578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.016148268535324674, "kl": 0.010346755385398865, "learning_rate": 9.857450191464337e-07, "loss": 0.0001, "num_tokens": 13021955.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.527945876121521, "sampling/importance_sampling_ratio/mean": 0.9997206926345825, "sampling/importance_sampling_ratio/min": 0.4597110450267792, "sampling/sampling_logp_difference/max": 0.777157187461853, "sampling/sampling_logp_difference/mean": 0.01393515057861805, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 236.140625, "completions/mean_terminated_length": 236.140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.437416136264801, "epoch": 0.5098039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7352635655684305, "kl": 0.008323276415467262, "learning_rate": 9.855756350374386e-07, "loss": 0.0104, "num_tokens": 13065692.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.7883018255233765, "sampling/importance_sampling_ratio/mean": 0.9998971819877625, "sampling/importance_sampling_ratio/min": 0.5383554697036743, "sampling/sampling_logp_difference/max": 0.6192362308502197, "sampling/sampling_logp_difference/mean": 0.017036153003573418, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 222.28125, "completions/mean_terminated_length": 222.28125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.43742769956588745, "epoch": 0.5110294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.9121850816629584, "kl": 0.010742575861513615, "learning_rate": 9.854052652406665e-07, "loss": 0.0112, "num_tokens": 13100894.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.557647466659546, "sampling/importance_sampling_ratio/mean": 1.0001201629638672, "sampling/importance_sampling_ratio/min": 0.5368156433105469, "sampling/sampling_logp_difference/max": 0.6221005916595459, "sampling/sampling_logp_difference/mean": 0.01638263650238514, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 194.453125, "completions/mean_terminated_length": 194.453125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3751828074455261, "epoch": 0.5122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.0845766431346806, "kl": 0.01224912516772747, "learning_rate": 9.852339101019572e-07, "loss": -0.0416, "num_tokens": 13130507.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.8485195636749268, "sampling/importance_sampling_ratio/mean": 0.9996465444564819, "sampling/importance_sampling_ratio/min": 0.5415288805961609, "sampling/sampling_logp_difference/max": 0.6143851280212402, "sampling/sampling_logp_difference/mean": 0.015233149752020836, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 203.484375, "completions/mean_terminated_length": 203.484375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.45921579003334045, "epoch": 0.5134803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.3310712480782592, "kl": 0.012076552957296371, "learning_rate": 9.85061569969151e-07, "loss": -0.03, "num_tokens": 13164778.0, "reward": -0.03125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4747954607009888, "sampling/importance_sampling_ratio/mean": 0.99909508228302, "sampling/importance_sampling_ratio/min": 0.6616628766059875, "sampling/sampling_logp_difference/max": 0.41299915313720703, "sampling/sampling_logp_difference/mean": 0.016422288492321968, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 208.6875, "completions/mean_terminated_length": 208.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3692478835582733, "epoch": 0.5147058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02518174999162162, "kl": 0.01171743031591177, "learning_rate": 9.848882451920875e-07, "loss": 0.0001, "num_tokens": 13196502.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.461438536643982, "sampling/importance_sampling_ratio/mean": 0.9998584985733032, "sampling/importance_sampling_ratio/min": 0.6298533082008362, "sampling/sampling_logp_difference/max": 0.4622683525085449, "sampling/sampling_logp_difference/mean": 0.014890164136886597, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 166.703125, "completions/mean_terminated_length": 166.703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.36562401056289673, "epoch": 0.5159313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.1242874525286544, "kl": 0.015466064214706421, "learning_rate": 9.847139361226046e-07, "loss": 0.0056, "num_tokens": 13221891.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6145374774932861, "sampling/importance_sampling_ratio/mean": 1.000512719154358, "sampling/importance_sampling_ratio/min": 0.6652603149414062, "sampling/sampling_logp_difference/max": 0.479048490524292, "sampling/sampling_logp_difference/mean": 0.01558524090796709, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.42874494194984436, "epoch": 0.5171568627450981, "frac_reward_zero_std": 0.5, "grad_norm": 1.4318856251018424, "kl": 0.022168340161442757, "learning_rate": 9.84538643114539e-07, "loss": -0.0212, "num_tokens": 13245975.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000335454940796, "sampling/importance_sampling_ratio/min": 0.6910908818244934, "sampling/sampling_logp_difference/max": 1.2831158638000488, "sampling/sampling_logp_difference/mean": 0.01832837238907814, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 179.109375, "completions/mean_terminated_length": 179.109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3738434314727783, "epoch": 0.5183823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.1146246273592262, "kl": 0.01757657900452614, "learning_rate": 9.843623665237242e-07, "loss": 0.0145, "num_tokens": 13276766.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4726742506027222, "sampling/importance_sampling_ratio/mean": 0.999600887298584, "sampling/importance_sampling_ratio/min": 0.4992694556713104, "sampling/sampling_logp_difference/max": 0.6946094036102295, "sampling/sampling_logp_difference/mean": 0.015279553830623627, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 202.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.32806265354156494, "epoch": 0.5196078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.021441570679582925, "kl": 0.01352194044739008, "learning_rate": 9.841851067079908e-07, "loss": 0.0001, "num_tokens": 13306210.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4367003440856934, "sampling/importance_sampling_ratio/mean": 1.0002729892730713, "sampling/importance_sampling_ratio/min": 0.5905013084411621, "sampling/sampling_logp_difference/max": 0.5267834663391113, "sampling/sampling_logp_difference/mean": 0.014424655586481094, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 214.65625, "completions/mean_terminated_length": 214.65625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.41232380270957947, "epoch": 0.5208333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.9098461013976387, "kl": 0.020982306450605392, "learning_rate": 9.840068640271647e-07, "loss": 0.0202, "num_tokens": 13334908.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.2892931699752808, "sampling/importance_sampling_ratio/mean": 1.00017511844635, "sampling/importance_sampling_ratio/min": 0.6300297975540161, "sampling/sampling_logp_difference/max": 0.4619882106781006, "sampling/sampling_logp_difference/mean": 0.015482231974601746, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 181.015625, "completions/mean_terminated_length": 181.015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.29491668939590454, "epoch": 0.5220588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.8415739092045318, "kl": 0.021540533751249313, "learning_rate": 9.838276388430675e-07, "loss": 0.0161, "num_tokens": 13361277.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5539828538894653, "sampling/importance_sampling_ratio/mean": 1.0003516674041748, "sampling/importance_sampling_ratio/min": 0.6896569132804871, "sampling/sampling_logp_difference/max": 0.44082117080688477, "sampling/sampling_logp_difference/mean": 0.012835600413382053, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 255.484375, "completions/mean_terminated_length": 255.484375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.49975988268852234, "epoch": 0.5232843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.650289150718313, "kl": 0.020539449527859688, "learning_rate": 9.836474315195147e-07, "loss": 0.0224, "num_tokens": 13399964.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.9864652156829834, "sampling/importance_sampling_ratio/mean": 1.0003230571746826, "sampling/importance_sampling_ratio/min": 0.6403182148933411, "sampling/sampling_logp_difference/max": 0.686356782913208, "sampling/sampling_logp_difference/mean": 0.016608383506536484, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 193.84375, "completions/mean_terminated_length": 193.84375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.36028361320495605, "epoch": 0.5245098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.9515160665297485, "kl": 0.03193719685077667, "learning_rate": 9.83466242422316e-07, "loss": -0.0123, "num_tokens": 13427074.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4429755210876465, "sampling/importance_sampling_ratio/mean": 0.9999572038650513, "sampling/importance_sampling_ratio/min": 0.5236291289329529, "sampling/sampling_logp_difference/max": 0.6469717025756836, "sampling/sampling_logp_difference/mean": 0.01483781449496746, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 257.140625, "completions/mean_terminated_length": 257.140625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3348478078842163, "epoch": 0.5257352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 0.7325311645335302, "kl": 0.01973658800125122, "learning_rate": 9.832840719192735e-07, "loss": -0.0226, "num_tokens": 13462139.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.477462887763977, "sampling/importance_sampling_ratio/mean": 0.9998133182525635, "sampling/importance_sampling_ratio/min": 0.6276203393936157, "sampling/sampling_logp_difference/max": 0.4658198356628418, "sampling/sampling_logp_difference/mean": 0.012661002576351166, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 236.0625, "completions/mean_terminated_length": 236.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4094201326370239, "epoch": 0.5269607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.8411620741604495, "kl": 0.0227787084877491, "learning_rate": 9.831009203801822e-07, "loss": -0.0028, "num_tokens": 13497567.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6725372076034546, "sampling/importance_sampling_ratio/mean": 1.00007963180542, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.5143417119979858, "sampling/sampling_logp_difference/mean": 0.013968531042337418, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 283.78125, "completions/mean_terminated_length": 283.78125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35234811902046204, "epoch": 0.5281862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.7081715264001555, "kl": 0.02388506382703781, "learning_rate": 9.829167881768277e-07, "loss": 0.0058, "num_tokens": 13536129.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.542195439338684, "sampling/importance_sampling_ratio/mean": 1.0000916719436646, "sampling/importance_sampling_ratio/min": 0.6269434690475464, "sampling/sampling_logp_difference/max": 0.46689891815185547, "sampling/sampling_logp_difference/mean": 0.012889344245195389, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 282.828125, "completions/mean_terminated_length": 282.828125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.49479252099990845, "epoch": 0.5294117647058824, "frac_reward_zero_std": 0.5, "grad_norm": 1.057304346159149, "kl": 0.020965974777936935, "learning_rate": 9.82731675682987e-07, "loss": 0.0221, "num_tokens": 13574166.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3233028650283813, "sampling/importance_sampling_ratio/mean": 0.9997720122337341, "sampling/importance_sampling_ratio/min": 0.6226791739463806, "sampling/sampling_logp_difference/max": 0.4737238883972168, "sampling/sampling_logp_difference/mean": 0.01630120724439621, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 262.046875, "completions/mean_terminated_length": 262.046875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.49494457244873047, "epoch": 0.5306372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.9025425524831447, "kl": 0.030203722417354584, "learning_rate": 9.825455832744266e-07, "loss": -0.0046, "num_tokens": 13611705.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5823229551315308, "sampling/importance_sampling_ratio/mean": 0.999476432800293, "sampling/importance_sampling_ratio/min": 0.614302396774292, "sampling/sampling_logp_difference/max": 0.48726797103881836, "sampling/sampling_logp_difference/mean": 0.015790484845638275, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 330.78125, "completions/mean_terminated_length": 330.78125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.48657986521720886, "epoch": 0.5318627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.6149870979271623, "kl": 0.02005343697965145, "learning_rate": 9.823585113289023e-07, "loss": 0.0058, "num_tokens": 13661579.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2989569902420044, "sampling/importance_sampling_ratio/mean": 0.999963641166687, "sampling/importance_sampling_ratio/min": 0.6136351227760315, "sampling/sampling_logp_difference/max": 0.48835480213165283, "sampling/sampling_logp_difference/mean": 0.015401165932416916, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 294.6875, "completions/mean_terminated_length": 294.6875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4754374921321869, "epoch": 0.5330882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 0.76943731528402, "kl": 0.02414841577410698, "learning_rate": 9.821704602261585e-07, "loss": 0.023, "num_tokens": 13702807.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4890077114105225, "sampling/importance_sampling_ratio/mean": 0.9997965693473816, "sampling/importance_sampling_ratio/min": 0.6805422902107239, "sampling/sampling_logp_difference/max": 0.39810991287231445, "sampling/sampling_logp_difference/mean": 0.015343626961112022, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3381650447845459, "epoch": 0.5343137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.03057740845848974, "kl": 0.02493605762720108, "learning_rate": 9.819814303479267e-07, "loss": 0.0002, "num_tokens": 13734935.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4994944334030151, "sampling/importance_sampling_ratio/mean": 0.9998176097869873, "sampling/importance_sampling_ratio/min": 0.6151838302612305, "sampling/sampling_logp_difference/max": 0.48583412170410156, "sampling/sampling_logp_difference/mean": 0.012891847640275955, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 250.5625, "completions/mean_terminated_length": 250.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4581558108329773, "epoch": 0.5355392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.03482387842695008, "kl": 0.03028777986764908, "learning_rate": 9.817914220779256e-07, "loss": 0.0003, "num_tokens": 13769003.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.389121651649475, "sampling/importance_sampling_ratio/mean": 0.9999805688858032, "sampling/importance_sampling_ratio/min": 0.7036060690879822, "sampling/sampling_logp_difference/max": 0.3515366315841675, "sampling/sampling_logp_difference/mean": 0.015824012458324432, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 202.328125, "completions/mean_terminated_length": 202.328125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4181891977787018, "epoch": 0.5367647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0362809519320139, "kl": 0.03653764724731445, "learning_rate": 9.816004358018603e-07, "loss": 0.0003, "num_tokens": 13799360.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4336248636245728, "sampling/importance_sampling_ratio/mean": 1.0002336502075195, "sampling/importance_sampling_ratio/min": 0.6557809710502625, "sampling/sampling_logp_difference/max": 0.42192840576171875, "sampling/sampling_logp_difference/mean": 0.015442028641700745, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.35558366775512695, "epoch": 0.5379901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.030189696554764472, "kl": 0.03488951548933983, "learning_rate": 9.814084719074204e-07, "loss": 0.0003, "num_tokens": 13828560.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.621432900428772, "sampling/importance_sampling_ratio/mean": 0.9999073147773743, "sampling/importance_sampling_ratio/min": 0.6702842116355896, "sampling/sampling_logp_difference/max": 0.4833102226257324, "sampling/sampling_logp_difference/mean": 0.015155116096138954, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 264.4375, "completions/mean_terminated_length": 264.4375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.43071991205215454, "epoch": 0.5392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.591754916786323, "kl": 0.034047938883304596, "learning_rate": 9.81215530784281e-07, "loss": -0.007, "num_tokens": 13861340.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.6207441091537476, "sampling/importance_sampling_ratio/mean": 1.0001143217086792, "sampling/importance_sampling_ratio/min": 0.6139618158340454, "sampling/sampling_logp_difference/max": 0.4878225326538086, "sampling/sampling_logp_difference/mean": 0.014538413845002651, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 180.078125, "completions/mean_terminated_length": 180.078125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.36084920167922974, "epoch": 0.5404411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 1.1432552175845352, "kl": 0.038981202989816666, "learning_rate": 9.810216128240996e-07, "loss": 0.0332, "num_tokens": 13887153.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5471285581588745, "sampling/importance_sampling_ratio/mean": 1.0002764463424683, "sampling/importance_sampling_ratio/min": 0.6482198238372803, "sampling/sampling_logp_difference/max": 0.4364006519317627, "sampling/sampling_logp_difference/mean": 0.015175689943134785, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 244.5625, "completions/mean_terminated_length": 244.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.37251731753349304, "epoch": 0.5416666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.635142205905668, "kl": 0.02709903195500374, "learning_rate": 9.808267184205181e-07, "loss": 0.0009, "num_tokens": 13924437.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5991549491882324, "sampling/importance_sampling_ratio/mean": 1.0000724792480469, "sampling/importance_sampling_ratio/min": 0.3642270565032959, "sampling/sampling_logp_difference/max": 1.0099778175354004, "sampling/sampling_logp_difference/mean": 0.01269014272838831, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 287.84375, "completions/mean_terminated_length": 287.84375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5027158260345459, "epoch": 0.5428921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 0.9928813317044869, "kl": 0.029346946626901627, "learning_rate": 9.806308479691594e-07, "loss": -0.0264, "num_tokens": 13964091.0, "reward": -0.1875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3728333711624146, "sampling/importance_sampling_ratio/mean": 0.9996798038482666, "sampling/importance_sampling_ratio/min": 0.45287877321243286, "sampling/sampling_logp_difference/max": 0.7921308279037476, "sampling/sampling_logp_difference/mean": 0.015696164220571518, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 228.546875, "completions/mean_terminated_length": 228.546875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4584943652153015, "epoch": 0.5441176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 0.7849590758617065, "kl": 0.02762674354016781, "learning_rate": 9.80434001867628e-07, "loss": -0.0096, "num_tokens": 14003614.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5744317770004272, "sampling/importance_sampling_ratio/mean": 1.0002050399780273, "sampling/importance_sampling_ratio/min": 0.6228858232498169, "sampling/sampling_logp_difference/max": 0.4733920097351074, "sampling/sampling_logp_difference/mean": 0.01629599556326866, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 202.859375, "completions/mean_terminated_length": 202.859375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3557378947734833, "epoch": 0.5453431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.031780399021234605, "kl": 0.02672041766345501, "learning_rate": 9.802361805155097e-07, "loss": 0.0003, "num_tokens": 14029573.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5874168872833252, "sampling/importance_sampling_ratio/mean": 0.9998052716255188, "sampling/importance_sampling_ratio/min": 0.6247994899749756, "sampling/sampling_logp_difference/max": 0.4703245162963867, "sampling/sampling_logp_difference/mean": 0.015231041237711906, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 297.71875, "completions/mean_terminated_length": 297.71875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4070301055908203, "epoch": 0.5465686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.022448326964289424, "kl": 0.022481031715869904, "learning_rate": 9.800373843143683e-07, "loss": 0.0002, "num_tokens": 14076371.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6207866668701172, "sampling/importance_sampling_ratio/mean": 1.0000665187835693, "sampling/importance_sampling_ratio/min": 0.7068935632705688, "sampling/sampling_logp_difference/max": 0.4829115867614746, "sampling/sampling_logp_difference/mean": 0.014261187054216862, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 208.515625, "completions/mean_terminated_length": 208.515625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4191475212574005, "epoch": 0.5477941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.02705528768846311, "kl": 0.02209075167775154, "learning_rate": 9.798376136677484e-07, "loss": 0.0002, "num_tokens": 14106676.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3644667863845825, "sampling/importance_sampling_ratio/mean": 0.9999842643737793, "sampling/importance_sampling_ratio/min": 0.5770688056945801, "sampling/sampling_logp_difference/max": 0.5497937202453613, "sampling/sampling_logp_difference/mean": 0.015202060341835022, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.38473254442214966, "epoch": 0.5490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.024562931390235142, "kl": 0.022687768563628197, "learning_rate": 9.796368689811712e-07, "loss": 0.0002, "num_tokens": 14138428.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8606518507003784, "sampling/importance_sampling_ratio/mean": 0.999741792678833, "sampling/importance_sampling_ratio/min": 0.6147654056549072, "sampling/sampling_logp_difference/max": 0.6209268569946289, "sampling/sampling_logp_difference/mean": 0.0159025676548481, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 190.21875, "completions/mean_terminated_length": 190.21875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.37754613161087036, "epoch": 0.5502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.032390692048454, "kl": 0.0324709452688694, "learning_rate": 9.79435150662136e-07, "loss": 0.0366, "num_tokens": 14163786.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.614363431930542, "sampling/importance_sampling_ratio/mean": 1.0000146627426147, "sampling/importance_sampling_ratio/min": 0.6300994753837585, "sampling/sampling_logp_difference/max": 0.4789407253265381, "sampling/sampling_logp_difference/mean": 0.015572885051369667, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 253.796875, "completions/mean_terminated_length": 253.796875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3987857699394226, "epoch": 0.5514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.018155648464634834, "kl": 0.020753910765051842, "learning_rate": 9.792324591201177e-07, "loss": 0.0002, "num_tokens": 14203613.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.42256498336792, "sampling/importance_sampling_ratio/mean": 1.0002810955047607, "sampling/importance_sampling_ratio/min": 0.6784428358078003, "sampling/sampling_logp_difference/max": 0.38795506954193115, "sampling/sampling_logp_difference/mean": 0.014544595032930374, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 227.046875, "completions/mean_terminated_length": 227.046875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.42339175939559937, "epoch": 0.5526960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.0838351412506786, "kl": 0.02783692628145218, "learning_rate": 9.790287947665681e-07, "loss": -0.0175, "num_tokens": 14240032.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.8147363662719727, "sampling/importance_sampling_ratio/mean": 1.0001912117004395, "sampling/importance_sampling_ratio/min": 0.5722141861915588, "sampling/sampling_logp_difference/max": 0.5959402322769165, "sampling/sampling_logp_difference/mean": 0.016225896775722504, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 217.296875, "completions/mean_terminated_length": 217.296875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3951900899410248, "epoch": 0.553921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.2519750170992627, "kl": 0.022853782400488853, "learning_rate": 9.788241580149122e-07, "loss": 0.0703, "num_tokens": 14276403.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004253387451172, "sampling/importance_sampling_ratio/min": 0.3980225920677185, "sampling/sampling_logp_difference/max": 0.9212465286254883, "sampling/sampling_logp_difference/mean": 0.014143247157335281, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 189.140625, "completions/mean_terminated_length": 189.140625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3912501335144043, "epoch": 0.5551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9601244698291272, "kl": 0.02590246871113777, "learning_rate": 9.786185492805501e-07, "loss": -0.0114, "num_tokens": 14303964.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5964021682739258, "sampling/importance_sampling_ratio/mean": 1.0004160404205322, "sampling/importance_sampling_ratio/min": 0.7093971967697144, "sampling/sampling_logp_difference/max": 0.46775245666503906, "sampling/sampling_logp_difference/mean": 0.015068481676280499, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 182.421875, "completions/mean_terminated_length": 182.421875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.37030190229415894, "epoch": 0.5563725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.1338304084887485, "kl": 0.028160633519291878, "learning_rate": 9.784119689808542e-07, "loss": 0.0073, "num_tokens": 14335335.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4796181917190552, "sampling/importance_sampling_ratio/mean": 1.0000211000442505, "sampling/importance_sampling_ratio/min": 0.6792578101158142, "sampling/sampling_logp_difference/max": 0.39178407192230225, "sampling/sampling_logp_difference/mean": 0.015386087819933891, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 177.765625, "completions/mean_terminated_length": 177.765625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.34240347146987915, "epoch": 0.5575980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02345553308197358, "kl": 0.028486458584666252, "learning_rate": 9.782044175351699e-07, "loss": 0.0003, "num_tokens": 14366856.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.42637038230896, "sampling/importance_sampling_ratio/mean": 0.9996859431266785, "sampling/importance_sampling_ratio/min": 0.6485440731048584, "sampling/sampling_logp_difference/max": 0.4330253601074219, "sampling/sampling_logp_difference/mean": 0.015499918721616268, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 190.71875, "completions/mean_terminated_length": 190.71875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.47459185123443604, "epoch": 0.5588235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.0209203187628284, "kl": 0.031104128807783127, "learning_rate": 9.779958953648129e-07, "loss": 0.0363, "num_tokens": 14398294.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3172099590301514, "sampling/importance_sampling_ratio/mean": 0.999106764793396, "sampling/importance_sampling_ratio/min": 0.6602303981781006, "sampling/sampling_logp_difference/max": 0.415166437625885, "sampling/sampling_logp_difference/mean": 0.016539257019758224, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 174.828125, "completions/mean_terminated_length": 174.828125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.42182838916778564, "epoch": 0.5600490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 1.021459722073482, "kl": 0.02934981882572174, "learning_rate": 9.777864028930705e-07, "loss": -0.0026, "num_tokens": 14425291.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.483134150505066, "sampling/importance_sampling_ratio/mean": 0.9997996687889099, "sampling/importance_sampling_ratio/min": 0.6134141683578491, "sampling/sampling_logp_difference/max": 0.48871493339538574, "sampling/sampling_logp_difference/mean": 0.017257895320653915, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 164.28125, "completions/mean_terminated_length": 164.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2947123050689697, "epoch": 0.5612745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.9367905375406443, "kl": 0.032778844237327576, "learning_rate": 9.775759405451986e-07, "loss": -0.0105, "num_tokens": 14456493.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4753822088241577, "sampling/importance_sampling_ratio/mean": 1.00032639503479, "sampling/importance_sampling_ratio/min": 0.7047656178474426, "sampling/sampling_logp_difference/max": 0.38891708850860596, "sampling/sampling_logp_difference/mean": 0.01345054805278778, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 174.65625, "completions/mean_terminated_length": 174.65625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.417305052280426, "epoch": 0.5625, "frac_reward_zero_std": 0.5, "grad_norm": 1.0784437365504824, "kl": 0.036820050328969955, "learning_rate": 9.773645087484228e-07, "loss": 0.0099, "num_tokens": 14484567.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.380178689956665, "sampling/importance_sampling_ratio/mean": 0.9998287558555603, "sampling/importance_sampling_ratio/min": 0.6753222942352295, "sampling/sampling_logp_difference/max": 0.3925652503967285, "sampling/sampling_logp_difference/mean": 0.017081189900636673, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.36164188385009766, "epoch": 0.5637254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.4720842033761148, "kl": 0.03700194135308266, "learning_rate": 9.771521079319363e-07, "loss": 0.0037, "num_tokens": 14516623.0, "reward": 0.84375, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.571526050567627, "sampling/importance_sampling_ratio/mean": 1.0001332759857178, "sampling/importance_sampling_ratio/min": 0.4859488904476166, "sampling/sampling_logp_difference/max": 0.7216517925262451, "sampling/sampling_logp_difference/mean": 0.014842424541711807, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.40478748083114624, "epoch": 0.5649509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.9782248174795586, "kl": 0.03830481320619583, "learning_rate": 9.76938738526899e-07, "loss": -0.0054, "num_tokens": 14549359.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6708135604858398, "sampling/importance_sampling_ratio/mean": 1.000866174697876, "sampling/importance_sampling_ratio/min": 0.5204440951347351, "sampling/sampling_logp_difference/max": 0.6530728340148926, "sampling/sampling_logp_difference/mean": 0.017827894538640976, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 174.765625, "completions/mean_terminated_length": 174.765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3068470358848572, "epoch": 0.5661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.031222083588003047, "kl": 0.028238721191883087, "learning_rate": 9.767244009664376e-07, "loss": 0.0003, "num_tokens": 14583200.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4348357915878296, "sampling/importance_sampling_ratio/mean": 0.9999443292617798, "sampling/importance_sampling_ratio/min": 0.6481047868728638, "sampling/sampling_logp_difference/max": 0.4337029457092285, "sampling/sampling_logp_difference/mean": 0.012765954248607159, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 188.0625, "completions/mean_terminated_length": 188.0625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.40620073676109314, "epoch": 0.5674019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7900803798163197, "kl": 0.03524962067604065, "learning_rate": 9.765090956856435e-07, "loss": -0.0037, "num_tokens": 14613636.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4808640480041504, "sampling/importance_sampling_ratio/mean": 0.9999431371688843, "sampling/importance_sampling_ratio/min": 0.6116536855697632, "sampling/sampling_logp_difference/max": 0.4915890693664551, "sampling/sampling_logp_difference/mean": 0.01594621129333973, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 204.609375, "completions/mean_terminated_length": 204.609375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4412503242492676, "epoch": 0.5686274509803921, "frac_reward_zero_std": 0.25, "grad_norm": 1.466674119828152, "kl": 0.043802544474601746, "learning_rate": 9.76292823121573e-07, "loss": 0.0358, "num_tokens": 14647179.0, "reward": 0.71875, "reward_std": 0.565913200378418, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.9443401098251343, "sampling/importance_sampling_ratio/mean": 1.0002394914627075, "sampling/importance_sampling_ratio/min": 0.6217594146728516, "sampling/sampling_logp_difference/max": 0.6649227142333984, "sampling/sampling_logp_difference/mean": 0.01716477796435356, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 216.15625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4255213439464569, "epoch": 0.5698529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.138903439348918, "kl": 0.03624618798494339, "learning_rate": 9.760755837132457e-07, "loss": 0.024, "num_tokens": 14686613.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4094603061676025, "sampling/importance_sampling_ratio/mean": 1.000252604484558, "sampling/importance_sampling_ratio/min": 0.3503970205783844, "sampling/sampling_logp_difference/max": 1.0486884117126465, "sampling/sampling_logp_difference/mean": 0.015629207715392113, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.35965681076049805, "epoch": 0.571078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 1.1053258217089428, "kl": 0.04981109872460365, "learning_rate": 9.758573779016436e-07, "loss": 0.0045, "num_tokens": 14707037.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3779658079147339, "sampling/importance_sampling_ratio/mean": 0.9993205666542053, "sampling/importance_sampling_ratio/min": 0.6660267114639282, "sampling/sampling_logp_difference/max": 0.40642547607421875, "sampling/sampling_logp_difference/mean": 0.01673746109008789, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 173.90625, "completions/mean_terminated_length": 173.90625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.36547723412513733, "epoch": 0.5723039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 1.0466216720434813, "kl": 0.0352628119289875, "learning_rate": 9.75638206129711e-07, "loss": -0.0227, "num_tokens": 14733463.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.622634768486023, "sampling/importance_sampling_ratio/mean": 1.0002810955047607, "sampling/importance_sampling_ratio/min": 0.7660991549491882, "sampling/sampling_logp_difference/max": 0.4840512275695801, "sampling/sampling_logp_difference/mean": 0.015148546546697617, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 178.28125, "completions/mean_terminated_length": 178.28125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.5210790634155273, "epoch": 0.5735294117647058, "frac_reward_zero_std": 0.5, "grad_norm": 1.2991492167785403, "kl": 0.04535311460494995, "learning_rate": 9.754180688423524e-07, "loss": -0.0156, "num_tokens": 14763913.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4715548753738403, "sampling/importance_sampling_ratio/mean": 1.000481128692627, "sampling/importance_sampling_ratio/min": 0.6172164082527161, "sampling/sampling_logp_difference/max": 0.48253560066223145, "sampling/sampling_logp_difference/mean": 0.01772424206137657, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 182.046875, "completions/mean_terminated_length": 182.046875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.46314379572868347, "epoch": 0.5747549019607843, "frac_reward_zero_std": 0.25, "grad_norm": 1.9744322865687498, "kl": 0.05717692896723747, "learning_rate": 9.751969664864326e-07, "loss": -0.0096, "num_tokens": 14794892.0, "reward": 0.46875, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5310888290405273, "sampling/importance_sampling_ratio/mean": 1.0001671314239502, "sampling/importance_sampling_ratio/min": 0.6671816110610962, "sampling/sampling_logp_difference/max": 0.4259791374206543, "sampling/sampling_logp_difference/mean": 0.016762029379606247, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 189.65625, "completions/mean_terminated_length": 189.65625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.48477575182914734, "epoch": 0.5759803921568627, "frac_reward_zero_std": 0.25, "grad_norm": 1.3977615783156871, "kl": 0.04280099272727966, "learning_rate": 9.749748995107756e-07, "loss": 0.0007, "num_tokens": 14824502.0, "reward": 0.46875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997283816337585, "sampling/importance_sampling_ratio/min": 0.687197208404541, "sampling/sampling_logp_difference/max": 0.7038180828094482, "sampling/sampling_logp_difference/mean": 0.016724707558751106, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 183.1875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3991392254829407, "epoch": 0.5772058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.032043962557941105, "kl": 0.039326563477516174, "learning_rate": 9.74751868366163e-07, "loss": 0.0004, "num_tokens": 14854210.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3368841409683228, "sampling/importance_sampling_ratio/mean": 1.0000734329223633, "sampling/importance_sampling_ratio/min": 0.6670835614204407, "sampling/sampling_logp_difference/max": 0.40483999252319336, "sampling/sampling_logp_difference/mean": 0.013921466656029224, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 229.921875, "completions/mean_terminated_length": 229.921875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.48380720615386963, "epoch": 0.5784313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 0.7879687250330961, "kl": 0.03630056232213974, "learning_rate": 9.745278735053343e-07, "loss": 0.0047, "num_tokens": 14897197.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.83116614818573, "sampling/importance_sampling_ratio/mean": 1.0003652572631836, "sampling/importance_sampling_ratio/min": 0.3702128529548645, "sampling/sampling_logp_difference/max": 0.9936771392822266, "sampling/sampling_logp_difference/mean": 0.016720673069357872, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.40632766485214233, "epoch": 0.5796568627450981, "frac_reward_zero_std": 0.5, "grad_norm": 1.5074305833058312, "kl": 0.04674538969993591, "learning_rate": 9.743029153829845e-07, "loss": -0.0353, "num_tokens": 14928265.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.527808427810669, "sampling/importance_sampling_ratio/mean": 0.9999127984046936, "sampling/importance_sampling_ratio/min": 0.6771559119224548, "sampling/sampling_logp_difference/max": 0.42383432388305664, "sampling/sampling_logp_difference/mean": 0.015889152884483337, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 165.328125, "completions/mean_terminated_length": 165.328125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4638434648513794, "epoch": 0.5808823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 1.0500186038726147, "kl": 0.039422713220119476, "learning_rate": 9.740769944557644e-07, "loss": 0.0184, "num_tokens": 14962702.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6118754148483276, "sampling/importance_sampling_ratio/mean": 0.9996562600135803, "sampling/importance_sampling_ratio/min": 0.6100545525550842, "sampling/sampling_logp_difference/max": 0.49420690536499023, "sampling/sampling_logp_difference/mean": 0.01637447066605091, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 195.546875, "completions/mean_terminated_length": 195.546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.41811755299568176, "epoch": 0.5821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.8360398958246319, "kl": 0.039523884654045105, "learning_rate": 9.738501111822792e-07, "loss": -0.0022, "num_tokens": 14997889.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.536515712738037, "sampling/importance_sampling_ratio/mean": 0.999782383441925, "sampling/importance_sampling_ratio/min": 0.5495277643203735, "sampling/sampling_logp_difference/max": 0.5986959934234619, "sampling/sampling_logp_difference/mean": 0.015316365286707878, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 149.59375, "completions/mean_terminated_length": 149.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.29053980112075806, "epoch": 0.5833333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.03875701398066368, "kl": 0.0342644602060318, "learning_rate": 9.736222660230878e-07, "loss": 0.0003, "num_tokens": 15029351.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6604468822479248, "sampling/importance_sampling_ratio/mean": 1.0005455017089844, "sampling/importance_sampling_ratio/min": 0.5771161913871765, "sampling/sampling_logp_difference/max": 0.5497117042541504, "sampling/sampling_logp_difference/mean": 0.014183287508785725, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 200.4375, "completions/mean_terminated_length": 200.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4234068989753723, "epoch": 0.5845588235294118, "frac_reward_zero_std": 0.75, "grad_norm": 0.8786220431503255, "kl": 0.03679296374320984, "learning_rate": 9.73393459440701e-07, "loss": -0.0214, "num_tokens": 15062067.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.574912667274475, "sampling/importance_sampling_ratio/mean": 1.0002837181091309, "sampling/importance_sampling_ratio/min": 0.6802206039428711, "sampling/sampling_logp_difference/max": 0.45419979095458984, "sampling/sampling_logp_difference/mean": 0.017016585916280746, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 200.90625, "completions/mean_terminated_length": 200.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4000805914402008, "epoch": 0.5857843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.032605977714361904, "kl": 0.03804188221693039, "learning_rate": 9.73163691899582e-07, "loss": 0.0004, "num_tokens": 15095037.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4352234601974487, "sampling/importance_sampling_ratio/mean": 0.9999063014984131, "sampling/importance_sampling_ratio/min": 0.652175784111023, "sampling/sampling_logp_difference/max": 0.4274411201477051, "sampling/sampling_logp_difference/mean": 0.015267307870090008, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 130.78125, "completions/mean_terminated_length": 130.78125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3270447254180908, "epoch": 0.5870098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.0818031132066426, "kl": 0.055767759680747986, "learning_rate": 9.729329638661444e-07, "loss": 0.0055, "num_tokens": 15121471.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3264274597167969, "sampling/importance_sampling_ratio/mean": 0.9994399547576904, "sampling/importance_sampling_ratio/min": 0.6381850838661194, "sampling/sampling_logp_difference/max": 0.4491269588470459, "sampling/sampling_logp_difference/mean": 0.015719007700681686, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 231.59375, "completions/mean_terminated_length": 231.59375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3547309637069702, "epoch": 0.5882352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.04431702110735289, "kl": 0.039026688784360886, "learning_rate": 9.727012758087512e-07, "loss": 0.0004, "num_tokens": 15155941.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.507184624671936, "sampling/importance_sampling_ratio/mean": 1.0004100799560547, "sampling/importance_sampling_ratio/min": 0.6588627099990845, "sampling/sampling_logp_difference/max": 0.4172401428222656, "sampling/sampling_logp_difference/mean": 0.013316002674400806, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.39464816451072693, "epoch": 0.5894607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.028339915855737342, "kl": 0.04644651710987091, "learning_rate": 9.724686281977146e-07, "loss": 0.0004, "num_tokens": 15190197.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.36279296875, "sampling/importance_sampling_ratio/mean": 0.9995285272598267, "sampling/importance_sampling_ratio/min": 0.6821646690368652, "sampling/sampling_logp_difference/max": 0.38248419761657715, "sampling/sampling_logp_difference/mean": 0.014964552596211433, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.43031764030456543, "epoch": 0.5906862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.1476481998532106, "kl": 0.053885385394096375, "learning_rate": 9.722350215052946e-07, "loss": 0.0313, "num_tokens": 15224717.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.634546160697937, "sampling/importance_sampling_ratio/mean": 1.0000656843185425, "sampling/importance_sampling_ratio/min": 0.6249037981033325, "sampling/sampling_logp_difference/max": 0.4913651943206787, "sampling/sampling_logp_difference/mean": 0.016320811584591866, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 173.953125, "completions/mean_terminated_length": 173.953125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3014456629753113, "epoch": 0.5919117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.03749685995290343, "kl": 0.04681577906012535, "learning_rate": 9.720004562056979e-07, "loss": 0.0004, "num_tokens": 15253738.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.418946623802185, "sampling/importance_sampling_ratio/mean": 1.0003348588943481, "sampling/importance_sampling_ratio/min": 0.6374188661575317, "sampling/sampling_logp_difference/max": 0.45032835006713867, "sampling/sampling_logp_difference/mean": 0.013938084244728088, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 187.359375, "completions/mean_terminated_length": 187.359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.37069573998451233, "epoch": 0.5931372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.07186512216651662, "kl": 0.04863576591014862, "learning_rate": 9.717649327750773e-07, "loss": 0.0005, "num_tokens": 15284929.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6029527187347412, "sampling/importance_sampling_ratio/mean": 1.0000958442687988, "sampling/importance_sampling_ratio/min": 0.6163931488990784, "sampling/sampling_logp_difference/max": 0.483870267868042, "sampling/sampling_logp_difference/mean": 0.01496157981455326, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.31721800565719604, "epoch": 0.5943627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.025045108273247088, "kl": 0.03179796040058136, "learning_rate": 9.7152845169153e-07, "loss": 0.0003, "num_tokens": 15314219.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3244808912277222, "sampling/importance_sampling_ratio/mean": 0.9997227191925049, "sampling/importance_sampling_ratio/min": 0.6771866083145142, "sampling/sampling_logp_difference/max": 0.38980841636657715, "sampling/sampling_logp_difference/mean": 0.013248606584966183, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 226.078125, "completions/mean_terminated_length": 226.078125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4020075798034668, "epoch": 0.5955882352941176, "frac_reward_zero_std": 0.75, "grad_norm": 0.865404599052011, "kl": 0.03322160243988037, "learning_rate": 9.712910134350984e-07, "loss": -0.0068, "num_tokens": 15346192.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5245110988616943, "sampling/importance_sampling_ratio/mean": 0.9994904398918152, "sampling/importance_sampling_ratio/min": 0.6649379134178162, "sampling/sampling_logp_difference/max": 0.4216737747192383, "sampling/sampling_logp_difference/mean": 0.01634867489337921, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 165.921875, "completions/mean_terminated_length": 165.921875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.32319891452789307, "epoch": 0.5968137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.05301787512221058, "kl": 0.044561855494976044, "learning_rate": 9.710526184877666e-07, "loss": 0.0004, "num_tokens": 15371083.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5076807737350464, "sampling/importance_sampling_ratio/mean": 0.9999871253967285, "sampling/importance_sampling_ratio/min": 0.6214069724082947, "sampling/sampling_logp_difference/max": 0.47576904296875, "sampling/sampling_logp_difference/mean": 0.015059342607855797, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3339904248714447, "epoch": 0.5980392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.9118170726402404, "kl": 0.049351539462804794, "learning_rate": 9.708132673334615e-07, "loss": 0.0067, "num_tokens": 15397555.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3671423196792603, "sampling/importance_sampling_ratio/mean": 1.0000923871994019, "sampling/importance_sampling_ratio/min": 0.5302200317382812, "sampling/sampling_logp_difference/max": 0.6344633102416992, "sampling/sampling_logp_difference/mean": 0.0142977274954319, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 217.171875, "completions/mean_terminated_length": 217.171875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.369615375995636, "epoch": 0.5992647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.8277067553657047, "kl": 0.032596759498119354, "learning_rate": 9.705729604580505e-07, "loss": -0.0057, "num_tokens": 15428302.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5348131656646729, "sampling/importance_sampling_ratio/mean": 0.9999060034751892, "sampling/importance_sampling_ratio/min": 0.4914447069168091, "sampling/sampling_logp_difference/max": 0.7104058265686035, "sampling/sampling_logp_difference/mean": 0.014864052645862103, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 197.1875, "completions/mean_terminated_length": 197.1875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.30024808645248413, "epoch": 0.6004901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 0.791695814136919, "kl": 0.04219331592321396, "learning_rate": 9.703316983493412e-07, "loss": -0.0198, "num_tokens": 15457626.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6603162288665771, "sampling/importance_sampling_ratio/mean": 1.0006078481674194, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.5070080757141113, "sampling/sampling_logp_difference/mean": 0.013717526569962502, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3099389374256134, "epoch": 0.6017156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.022027601431333, "kl": 0.04899393394589424, "learning_rate": 9.700894814970808e-07, "loss": 0.0799, "num_tokens": 15486074.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5108519792556763, "sampling/importance_sampling_ratio/mean": 1.0001444816589355, "sampling/importance_sampling_ratio/min": 0.7161623239517212, "sampling/sampling_logp_difference/max": 0.4126737117767334, "sampling/sampling_logp_difference/mean": 0.01254919171333313, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 192.21875, "completions/mean_terminated_length": 192.21875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32838335633277893, "epoch": 0.6029411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.027043875500573203, "kl": 0.03538636118173599, "learning_rate": 9.698463103929541e-07, "loss": 0.0004, "num_tokens": 15514776.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3303242921829224, "sampling/importance_sampling_ratio/mean": 1.000047206878662, "sampling/importance_sampling_ratio/min": 0.7300384640693665, "sampling/sampling_logp_difference/max": 0.3146580457687378, "sampling/sampling_logp_difference/mean": 0.012933471240103245, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 164.21875, "completions/mean_terminated_length": 164.21875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.31112101674079895, "epoch": 0.6041666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03344291584823396, "kl": 0.041676055639982224, "learning_rate": 9.69602185530583e-07, "loss": 0.0004, "num_tokens": 15543478.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5971465110778809, "sampling/importance_sampling_ratio/mean": 0.9997182488441467, "sampling/importance_sampling_ratio/min": 0.6483074426651001, "sampling/sampling_logp_difference/max": 0.4682185649871826, "sampling/sampling_logp_difference/mean": 0.013913290575146675, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 201.703125, "completions/mean_terminated_length": 201.703125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.40549153089523315, "epoch": 0.6053921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9591883322297038, "kl": 0.031131578609347343, "learning_rate": 9.693571074055254e-07, "loss": 0.0042, "num_tokens": 15572531.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4643051624298096, "sampling/importance_sampling_ratio/mean": 0.9999620318412781, "sampling/importance_sampling_ratio/min": 0.5960602164268494, "sampling/sampling_logp_difference/max": 0.5174136161804199, "sampling/sampling_logp_difference/mean": 0.016925688832998276, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 199.671875, "completions/mean_terminated_length": 199.671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.36816954612731934, "epoch": 0.6066176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 0.9265582481618314, "kl": 0.04405049607157707, "learning_rate": 9.691110765152744e-07, "loss": -0.0131, "num_tokens": 15603278.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6700979471206665, "sampling/importance_sampling_ratio/mean": 1.0002562999725342, "sampling/importance_sampling_ratio/min": 0.612542986869812, "sampling/sampling_logp_difference/max": 0.5128822326660156, "sampling/sampling_logp_difference/mean": 0.01636900007724762, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 234.359375, "completions/mean_terminated_length": 234.359375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.28878411650657654, "epoch": 0.6078431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.01999654005497948, "kl": 0.027198560535907745, "learning_rate": 9.688640933592572e-07, "loss": 0.0003, "num_tokens": 15633573.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4348645210266113, "sampling/importance_sampling_ratio/mean": 1.00016188621521, "sampling/importance_sampling_ratio/min": 0.6262774467468262, "sampling/sampling_logp_difference/max": 0.46796178817749023, "sampling/sampling_logp_difference/mean": 0.011264925822615623, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3872169256210327, "epoch": 0.6090686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.030476085308733487, "kl": 0.027259351685643196, "learning_rate": 9.686161584388339e-07, "loss": 0.0003, "num_tokens": 15667861.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6089355945587158, "sampling/importance_sampling_ratio/mean": 0.9999793171882629, "sampling/importance_sampling_ratio/min": 0.4886479079723358, "sampling/sampling_logp_difference/max": 0.7161130905151367, "sampling/sampling_logp_difference/mean": 0.015679148957133293, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 220.609375, "completions/mean_terminated_length": 220.609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.4441712200641632, "epoch": 0.6102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.02724183738489788, "kl": 0.03278857469558716, "learning_rate": 9.683672722572966e-07, "loss": 0.0003, "num_tokens": 15698796.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4401459693908691, "sampling/importance_sampling_ratio/mean": 1.0000853538513184, "sampling/importance_sampling_ratio/min": 0.6875020265579224, "sampling/sampling_logp_difference/max": 0.37469053268432617, "sampling/sampling_logp_difference/mean": 0.016673587262630463, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 232.828125, "completions/mean_terminated_length": 232.828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.35596445202827454, "epoch": 0.6115196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.03127872159983417, "kl": 0.030974477529525757, "learning_rate": 9.681174353198686e-07, "loss": 0.0003, "num_tokens": 15731729.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.479277491569519, "sampling/importance_sampling_ratio/mean": 0.9996037483215332, "sampling/importance_sampling_ratio/min": 0.6136996150016785, "sampling/sampling_logp_difference/max": 0.4882497787475586, "sampling/sampling_logp_difference/mean": 0.014733528718352318, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 178.984375, "completions/mean_terminated_length": 178.984375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.39300116896629333, "epoch": 0.6127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03591396388828622, "kl": 0.03897593915462494, "learning_rate": 9.678666481337031e-07, "loss": 0.0004, "num_tokens": 15761648.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5277884006500244, "sampling/importance_sampling_ratio/mean": 0.9996728301048279, "sampling/importance_sampling_ratio/min": 0.7004489302635193, "sampling/sampling_logp_difference/max": 0.42382121086120605, "sampling/sampling_logp_difference/mean": 0.016662370413541794, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 267.265625, "completions/mean_terminated_length": 267.265625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3144611716270447, "epoch": 0.6139705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 0.749493892103405, "kl": 0.02991885133087635, "learning_rate": 9.67614911207882e-07, "loss": 0.0046, "num_tokens": 15795393.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3558284044265747, "sampling/importance_sampling_ratio/mean": 1.0002613067626953, "sampling/importance_sampling_ratio/min": 0.6464096307754517, "sampling/sampling_logp_difference/max": 0.4363219738006592, "sampling/sampling_logp_difference/mean": 0.013599826022982597, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 276.828125, "completions/mean_terminated_length": 276.828125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.2665802836418152, "epoch": 0.6151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.017226321367376545, "kl": 0.021591052412986755, "learning_rate": 9.673622250534155e-07, "loss": 0.0002, "num_tokens": 15832678.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.465712070465088, "sampling/importance_sampling_ratio/mean": 1.0006533861160278, "sampling/importance_sampling_ratio/min": 0.6309396624565125, "sampling/sampling_logp_difference/max": 0.46054506301879883, "sampling/sampling_logp_difference/mean": 0.011435139924287796, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2680723965167999, "epoch": 0.616421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.06538297599717596, "kl": 0.04390225559473038, "learning_rate": 9.671085901832404e-07, "loss": 0.0004, "num_tokens": 15855342.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5716720819473267, "sampling/importance_sampling_ratio/mean": 0.9990843534469604, "sampling/importance_sampling_ratio/min": 0.617487370967865, "sampling/sampling_logp_difference/max": 0.48209667205810547, "sampling/sampling_logp_difference/mean": 0.01410925853997469, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 226.421875, "completions/mean_terminated_length": 226.421875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2998732924461365, "epoch": 0.6176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.028155419050621314, "kl": 0.027188263833522797, "learning_rate": 9.668540071122195e-07, "loss": 0.0003, "num_tokens": 15885801.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5242469310760498, "sampling/importance_sampling_ratio/mean": 0.9993939399719238, "sampling/importance_sampling_ratio/min": 0.623637855052948, "sampling/sampling_logp_difference/max": 0.4721853733062744, "sampling/sampling_logp_difference/mean": 0.015196947380900383, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 229.171875, "completions/mean_terminated_length": 229.171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.33026987314224243, "epoch": 0.6188725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.905760075770604, "kl": 0.030161835253238678, "learning_rate": 9.665984763571402e-07, "loss": 0.011, "num_tokens": 15918836.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4756348133087158, "sampling/importance_sampling_ratio/mean": 1.0005979537963867, "sampling/importance_sampling_ratio/min": 0.5976086854934692, "sampling/sampling_logp_difference/max": 0.5148191452026367, "sampling/sampling_logp_difference/mean": 0.013623598031699657, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.35385337471961975, "epoch": 0.6200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.041170782730491726, "kl": 0.0330902636051178, "learning_rate": 9.663419984367137e-07, "loss": 0.0003, "num_tokens": 15951116.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8479071855545044, "sampling/importance_sampling_ratio/mean": 1.0002179145812988, "sampling/importance_sampling_ratio/min": 0.6587467789649963, "sampling/sampling_logp_difference/max": 0.6140537261962891, "sampling/sampling_logp_difference/mean": 0.014573907479643822, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 227.015625, "completions/mean_terminated_length": 227.015625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3508400321006775, "epoch": 0.6213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02524040177034461, "kl": 0.027214359492063522, "learning_rate": 9.660845738715742e-07, "loss": 0.0003, "num_tokens": 15981533.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9693646430969238, "sampling/importance_sampling_ratio/mean": 0.9999514818191528, "sampling/importance_sampling_ratio/min": 0.6059155464172363, "sampling/sampling_logp_difference/max": 0.677711009979248, "sampling/sampling_logp_difference/mean": 0.01470345351845026, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 238.421875, "completions/mean_terminated_length": 238.421875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4612578749656677, "epoch": 0.6225490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.7956611667635086, "kl": 0.0296800397336483, "learning_rate": 9.658262031842769e-07, "loss": 0.0164, "num_tokens": 16015192.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5379420518875122, "sampling/importance_sampling_ratio/mean": 1.0001564025878906, "sampling/importance_sampling_ratio/min": 0.3161124885082245, "sampling/sampling_logp_difference/max": 1.1516571044921875, "sampling/sampling_logp_difference/mean": 0.015863286331295967, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.34082987904548645, "epoch": 0.6237745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.030028764226007094, "kl": 0.03355834260582924, "learning_rate": 9.655668868992983e-07, "loss": 0.0003, "num_tokens": 16048516.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5475269556045532, "sampling/importance_sampling_ratio/mean": 1.0004501342773438, "sampling/importance_sampling_ratio/min": 0.6653298139572144, "sampling/sampling_logp_difference/max": 0.4366581439971924, "sampling/sampling_logp_difference/mean": 0.014498144388198853, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 168.8125, "completions/mean_terminated_length": 168.8125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.28656935691833496, "epoch": 0.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.03789431316473932, "kl": 0.029552018269896507, "learning_rate": 9.653066255430338e-07, "loss": 0.0003, "num_tokens": 16074808.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4943023920059204, "sampling/importance_sampling_ratio/mean": 0.999640703201294, "sampling/importance_sampling_ratio/min": 0.5432738065719604, "sampling/sampling_logp_difference/max": 0.6101418733596802, "sampling/sampling_logp_difference/mean": 0.013910012319684029, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 249.71875, "completions/mean_terminated_length": 249.71875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3192594051361084, "epoch": 0.6262254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.018134661715106467, "kl": 0.021534759551286697, "learning_rate": 9.650454196437973e-07, "loss": 0.0002, "num_tokens": 16106294.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4958059787750244, "sampling/importance_sampling_ratio/mean": 1.0002377033233643, "sampling/importance_sampling_ratio/min": 0.718105137348175, "sampling/sampling_logp_difference/max": 0.40266525745391846, "sampling/sampling_logp_difference/mean": 0.013101302087306976, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 245.171875, "completions/mean_terminated_length": 245.171875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.32846811413764954, "epoch": 0.6274509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6441520351435096, "kl": 0.025811253115534782, "learning_rate": 9.647832697318206e-07, "loss": -0.027, "num_tokens": 16141777.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4265540838241577, "sampling/importance_sampling_ratio/mean": 1.0003132820129395, "sampling/importance_sampling_ratio/min": 0.5689274668693542, "sampling/sampling_logp_difference/max": 0.564002275466919, "sampling/sampling_logp_difference/mean": 0.013478966429829597, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 241.71875, "completions/mean_terminated_length": 241.71875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3434150815010071, "epoch": 0.6286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02379823909954446, "kl": 0.029206233099102974, "learning_rate": 9.645201763392513e-07, "loss": 0.0003, "num_tokens": 16175023.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5291852951049805, "sampling/importance_sampling_ratio/mean": 1.0002974271774292, "sampling/importance_sampling_ratio/min": 0.6547035574913025, "sampling/sampling_logp_difference/max": 0.42473506927490234, "sampling/sampling_logp_difference/mean": 0.013717292807996273, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3104274272918701, "epoch": 0.6299019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.03345929852069136, "kl": 0.03869575262069702, "learning_rate": 9.64256140000152e-07, "loss": 0.0004, "num_tokens": 16203639.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5091041326522827, "sampling/importance_sampling_ratio/mean": 0.9998499155044556, "sampling/importance_sampling_ratio/min": 0.6605297327041626, "sampling/sampling_logp_difference/max": 0.41471314430236816, "sampling/sampling_logp_difference/mean": 0.014581705443561077, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 184.5625, "completions/mean_terminated_length": 184.5625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3684101402759552, "epoch": 0.6311274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.02124262357646272, "kl": 0.026777107268571854, "learning_rate": 9.639911612505003e-07, "loss": 0.0003, "num_tokens": 16236283.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4286984205245972, "sampling/importance_sampling_ratio/mean": 0.9999425411224365, "sampling/importance_sampling_ratio/min": 0.6769490242004395, "sampling/sampling_logp_difference/max": 0.39015936851501465, "sampling/sampling_logp_difference/mean": 0.014349126257002354, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3284221291542053, "epoch": 0.6323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.030142720478342194, "kl": 0.031212424859404564, "learning_rate": 9.63725240628186e-07, "loss": 0.0003, "num_tokens": 16267115.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3817118406295776, "sampling/importance_sampling_ratio/mean": 0.9998214840888977, "sampling/importance_sampling_ratio/min": 0.5461640357971191, "sampling/sampling_logp_difference/max": 0.6048359870910645, "sampling/sampling_logp_difference/mean": 0.013921651989221573, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 182.609375, "completions/mean_terminated_length": 182.609375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4168780744075775, "epoch": 0.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.026171104157375946, "kl": 0.03219608962535858, "learning_rate": 9.634583786730108e-07, "loss": 0.0003, "num_tokens": 16297090.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.764905333518982, "sampling/importance_sampling_ratio/mean": 1.0003851652145386, "sampling/importance_sampling_ratio/min": 0.6018215417861938, "sampling/sampling_logp_difference/max": 0.5680971145629883, "sampling/sampling_logp_difference/mean": 0.016892993822693825, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3474350869655609, "epoch": 0.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.022862287577593118, "kl": 0.02698478288948536, "learning_rate": 9.63190575926688e-07, "loss": 0.0003, "num_tokens": 16330642.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744448900222778, "sampling/importance_sampling_ratio/mean": 0.9994927644729614, "sampling/importance_sampling_ratio/min": 0.732742965221405, "sampling/sampling_logp_difference/max": 0.4539027214050293, "sampling/sampling_logp_difference/mean": 0.013669499196112156, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 241.859375, "completions/mean_terminated_length": 241.859375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4381487965583801, "epoch": 0.6360294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.7083376676619356, "kl": 0.026370543986558914, "learning_rate": 9.6292183293284e-07, "loss": 0.0019, "num_tokens": 16365689.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3973273038864136, "sampling/importance_sampling_ratio/mean": 1.0000078678131104, "sampling/importance_sampling_ratio/min": 0.6761813163757324, "sampling/sampling_logp_difference/max": 0.391294002532959, "sampling/sampling_logp_difference/mean": 0.013794781640172005, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 268.21875, "completions/mean_terminated_length": 268.21875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3459845185279846, "epoch": 0.6372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.018214372495190298, "kl": 0.025208748877048492, "learning_rate": 9.626521502369983e-07, "loss": 0.0002, "num_tokens": 16396423.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6887457370758057, "sampling/importance_sampling_ratio/mean": 1.0004690885543823, "sampling/importance_sampling_ratio/min": 0.5917854309082031, "sampling/sampling_logp_difference/max": 0.5246111154556274, "sampling/sampling_logp_difference/mean": 0.012911073863506317, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 205.546875, "completions/mean_terminated_length": 205.546875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3549535572528839, "epoch": 0.6384803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.021210397741436706, "kl": 0.026987168937921524, "learning_rate": 9.623815283866015e-07, "loss": 0.0003, "num_tokens": 16425770.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3158849477767944, "sampling/importance_sampling_ratio/mean": 0.9997355937957764, "sampling/importance_sampling_ratio/min": 0.6550066471099854, "sampling/sampling_logp_difference/max": 0.42310988903045654, "sampling/sampling_logp_difference/mean": 0.013649879954755306, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 157.140625, "completions/mean_terminated_length": 157.140625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3509664833545685, "epoch": 0.6397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.0332528861772406, "kl": 0.033507656306028366, "learning_rate": 9.621099679309946e-07, "loss": 0.0003, "num_tokens": 16452371.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5231109857559204, "sampling/importance_sampling_ratio/mean": 1.000582218170166, "sampling/importance_sampling_ratio/min": 0.6256922483444214, "sampling/sampling_logp_difference/max": 0.46889662742614746, "sampling/sampling_logp_difference/mean": 0.015148179605603218, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 176.34375, "completions/mean_terminated_length": 176.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.33977288007736206, "epoch": 0.6409313725490197, "frac_reward_zero_std": 0.75, "grad_norm": 1.0450745515105564, "kl": 0.030788611620664597, "learning_rate": 9.618374694214285e-07, "loss": 0.0184, "num_tokens": 16477753.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3973685503005981, "sampling/importance_sampling_ratio/mean": 1.0008265972137451, "sampling/importance_sampling_ratio/min": 0.6237364411354065, "sampling/sampling_logp_difference/max": 0.4720273017883301, "sampling/sampling_logp_difference/mean": 0.013767888769507408, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 235.328125, "completions/mean_terminated_length": 235.328125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4321310818195343, "epoch": 0.6421568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.02730276239242315, "kl": 0.03425852209329605, "learning_rate": 9.615640334110578e-07, "loss": 0.0003, "num_tokens": 16515166.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6193687915802002, "sampling/importance_sampling_ratio/mean": 1.0003039836883545, "sampling/importance_sampling_ratio/min": 0.6216926574707031, "sampling/sampling_logp_difference/max": 0.4820363521575928, "sampling/sampling_logp_difference/mean": 0.014964373782277107, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4211269021034241, "epoch": 0.6433823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 0.9895386822369383, "kl": 0.031260281801223755, "learning_rate": 9.612896604549401e-07, "loss": 0.0008, "num_tokens": 16542158.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3415215015411377, "sampling/importance_sampling_ratio/mean": 0.9998394250869751, "sampling/importance_sampling_ratio/min": 0.5685389041900635, "sampling/sampling_logp_difference/max": 0.5646855235099792, "sampling/sampling_logp_difference/mean": 0.01657898537814617, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 210.1875, "completions/mean_terminated_length": 210.1875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.39013925194740295, "epoch": 0.6446078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.021518376420771413, "kl": 0.025803860276937485, "learning_rate": 9.610143511100354e-07, "loss": 0.0003, "num_tokens": 16570490.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.507156491279602, "sampling/importance_sampling_ratio/mean": 1.000083565711975, "sampling/importance_sampling_ratio/min": 0.6335555911064148, "sampling/sampling_logp_difference/max": 0.4564075469970703, "sampling/sampling_logp_difference/mean": 0.014585510827600956, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 228.59375, "completions/mean_terminated_length": 228.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.37804198265075684, "epoch": 0.6458333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.6245177947861602, "kl": 0.030640294775366783, "learning_rate": 9.607381059352038e-07, "loss": 0.0113, "num_tokens": 16605312.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.611337423324585, "sampling/importance_sampling_ratio/mean": 0.9999603033065796, "sampling/importance_sampling_ratio/min": 0.6217000484466553, "sampling/sampling_logp_difference/max": 0.4770646095275879, "sampling/sampling_logp_difference/mean": 0.014867281541228294, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 191.8125, "completions/mean_terminated_length": 191.8125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.353499174118042, "epoch": 0.6470588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.01948708633875749, "kl": 0.023790864273905754, "learning_rate": 9.60460925491206e-07, "loss": 0.0002, "num_tokens": 16638004.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4038140773773193, "sampling/importance_sampling_ratio/mean": 0.9999080896377563, "sampling/importance_sampling_ratio/min": 0.6374849677085876, "sampling/sampling_logp_difference/max": 0.4502246379852295, "sampling/sampling_logp_difference/mean": 0.014271529391407967, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 159.15625, "completions/mean_terminated_length": 159.15625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4149744510650635, "epoch": 0.6482843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9070816966077233, "kl": 0.03448011726140976, "learning_rate": 9.601828103407004e-07, "loss": -0.0021, "num_tokens": 16669070.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4417411088943481, "sampling/importance_sampling_ratio/mean": 0.9999873638153076, "sampling/importance_sampling_ratio/min": 0.7591252326965332, "sampling/sampling_logp_difference/max": 0.36585140228271484, "sampling/sampling_logp_difference/mean": 0.015302242711186409, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 204.765625, "completions/mean_terminated_length": 204.765625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3343036472797394, "epoch": 0.6495098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.019794629333848835, "kl": 0.02628006413578987, "learning_rate": 9.599037610482433e-07, "loss": 0.0003, "num_tokens": 16699983.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.290763258934021, "sampling/importance_sampling_ratio/mean": 1.0000771284103394, "sampling/importance_sampling_ratio/min": 0.6042301058769226, "sampling/sampling_logp_difference/max": 0.5038001537322998, "sampling/sampling_logp_difference/mean": 0.014323609881103039, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 182.328125, "completions/mean_terminated_length": 182.328125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3478272259235382, "epoch": 0.6507352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02157603009356838, "kl": 0.02507212944328785, "learning_rate": 9.59623778180287e-07, "loss": 0.0002, "num_tokens": 16730036.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.432883381843567, "sampling/importance_sampling_ratio/mean": 1.0006635189056396, "sampling/importance_sampling_ratio/min": 0.7574724555015564, "sampling/sampling_logp_difference/max": 0.35968875885009766, "sampling/sampling_logp_difference/mean": 0.014315593987703323, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.41078031063079834, "epoch": 0.6519607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.0024160231052435, "kl": 0.03048519417643547, "learning_rate": 9.593428623051791e-07, "loss": -0.0255, "num_tokens": 16759796.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4374518394470215, "sampling/importance_sampling_ratio/mean": 0.9993870258331299, "sampling/importance_sampling_ratio/min": 0.6051985621452332, "sampling/sampling_logp_difference/max": 0.5021986961364746, "sampling/sampling_logp_difference/mean": 0.016140135005116463, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 192.734375, "completions/mean_terminated_length": 192.734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35647279024124146, "epoch": 0.6531862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.019612389785877708, "kl": 0.02538762241601944, "learning_rate": 9.59061013993161e-07, "loss": 0.0002, "num_tokens": 16793795.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5227993726730347, "sampling/importance_sampling_ratio/mean": 0.9999618530273438, "sampling/importance_sampling_ratio/min": 0.6399714350700378, "sampling/sampling_logp_difference/max": 0.4463317394256592, "sampling/sampling_logp_difference/mean": 0.013458995148539543, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 209.6875, "completions/mean_terminated_length": 209.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3694957196712494, "epoch": 0.6544117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 0.5830303275611716, "kl": 0.027595968917012215, "learning_rate": 9.587782338163667e-07, "loss": -0.0027, "num_tokens": 16826191.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.543566346168518, "sampling/importance_sampling_ratio/mean": 1.000403642654419, "sampling/importance_sampling_ratio/min": 0.6606270670890808, "sampling/sampling_logp_difference/max": 0.43409550189971924, "sampling/sampling_logp_difference/mean": 0.013270800933241844, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 218.65625, "completions/mean_terminated_length": 218.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4609197676181793, "epoch": 0.6556372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.021030880073322496, "kl": 0.02733190357685089, "learning_rate": 9.584945223488226e-07, "loss": 0.0003, "num_tokens": 16861129.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2990570068359375, "sampling/importance_sampling_ratio/mean": 0.999771237373352, "sampling/importance_sampling_ratio/min": 0.6057155132293701, "sampling/sampling_logp_difference/max": 0.5013449192047119, "sampling/sampling_logp_difference/mean": 0.014958461746573448, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.44152161478996277, "epoch": 0.6568627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.021595070807418862, "kl": 0.025443298742175102, "learning_rate": 9.582098801664443e-07, "loss": 0.0003, "num_tokens": 16893961.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4320579767227173, "sampling/importance_sampling_ratio/mean": 0.9998829364776611, "sampling/importance_sampling_ratio/min": 0.6851339340209961, "sampling/sampling_logp_difference/max": 0.378140926361084, "sampling/sampling_logp_difference/mean": 0.015051309950649738, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 229.4375, "completions/mean_terminated_length": 229.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.39369893074035645, "epoch": 0.6580882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.019634264798132458, "kl": 0.027372129261493683, "learning_rate": 9.579243078470378e-07, "loss": 0.0003, "num_tokens": 16929557.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.485055685043335, "sampling/importance_sampling_ratio/mean": 0.9997790455818176, "sampling/importance_sampling_ratio/min": 0.5497868061065674, "sampling/sampling_logp_difference/max": 0.5982246398925781, "sampling/sampling_logp_difference/mean": 0.014482814818620682, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 204.125, "completions/mean_terminated_length": 204.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.5653814077377319, "epoch": 0.6593137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.9988587526287372, "kl": 0.03801679238677025, "learning_rate": 9.576378059702968e-07, "loss": -0.0023, "num_tokens": 16963741.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.436959981918335, "sampling/importance_sampling_ratio/mean": 1.0001435279846191, "sampling/importance_sampling_ratio/min": 0.7102527022361755, "sampling/sampling_logp_difference/max": 0.3625297546386719, "sampling/sampling_logp_difference/mean": 0.017763778567314148, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 205.3125, "completions/mean_terminated_length": 205.3125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.43430545926094055, "epoch": 0.6605392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.021384107168272663, "kl": 0.03111300617456436, "learning_rate": 9.573503751178018e-07, "loss": 0.0003, "num_tokens": 16996177.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.643847942352295, "sampling/importance_sampling_ratio/mean": 0.9997886419296265, "sampling/importance_sampling_ratio/min": 0.6688855290412903, "sampling/sampling_logp_difference/max": 0.497039794921875, "sampling/sampling_logp_difference/mean": 0.014810606837272644, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 181.546875, "completions/mean_terminated_length": 181.546875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5582599639892578, "epoch": 0.6617647058823529, "frac_reward_zero_std": 0.5, "grad_norm": 1.3363543230620318, "kl": 0.038329631090164185, "learning_rate": 9.570620158730194e-07, "loss": 0.0041, "num_tokens": 17034692.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3978967666625977, "sampling/importance_sampling_ratio/mean": 1.0004541873931885, "sampling/importance_sampling_ratio/min": 0.6934731006622314, "sampling/sampling_logp_difference/max": 0.3660428524017334, "sampling/sampling_logp_difference/mean": 0.01722395420074463, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 199.890625, "completions/mean_terminated_length": 199.890625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4284636974334717, "epoch": 0.6629901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.024349416401760998, "kl": 0.03167618438601494, "learning_rate": 9.567727288213004e-07, "loss": 0.0003, "num_tokens": 17064957.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.341268539428711, "sampling/importance_sampling_ratio/mean": 1.0001959800720215, "sampling/importance_sampling_ratio/min": 0.4498509168624878, "sampling/sampling_logp_difference/max": 0.7988390922546387, "sampling/sampling_logp_difference/mean": 0.01536840945482254, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.39415794610977173, "epoch": 0.6642156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.03048027674448327, "kl": 0.03377959877252579, "learning_rate": 9.564825145498793e-07, "loss": 0.0003, "num_tokens": 17096053.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4786802530288696, "sampling/importance_sampling_ratio/mean": 1.0002336502075195, "sampling/importance_sampling_ratio/min": 0.6484001278877258, "sampling/sampling_logp_difference/max": 0.43324732780456543, "sampling/sampling_logp_difference/mean": 0.014283658936619759, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 187.53125, "completions/mean_terminated_length": 187.53125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4927505850791931, "epoch": 0.6654411764705882, "frac_reward_zero_std": 0.5, "grad_norm": 1.3669374222846087, "kl": 0.03446304798126221, "learning_rate": 9.561913736478728e-07, "loss": 0.0567, "num_tokens": 17128455.0, "reward": 0.21875, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6559178829193115, "sampling/importance_sampling_ratio/mean": 1.000101089477539, "sampling/importance_sampling_ratio/min": 0.6176440119743347, "sampling/sampling_logp_difference/max": 0.5043554306030273, "sampling/sampling_logp_difference/mean": 0.01626628078520298, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 186.734375, "completions/mean_terminated_length": 186.734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3815724551677704, "epoch": 0.6666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.021800493826795702, "kl": 0.02597838081419468, "learning_rate": 9.558993067062784e-07, "loss": 0.0003, "num_tokens": 17157414.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3621898889541626, "sampling/importance_sampling_ratio/mean": 1.000091314315796, "sampling/importance_sampling_ratio/min": 0.5554914474487305, "sampling/sampling_logp_difference/max": 0.5879020690917969, "sampling/sampling_logp_difference/mean": 0.014378965832293034, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 206.6875, "completions/mean_terminated_length": 206.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.42119675874710083, "epoch": 0.6678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9023483510896259, "kl": 0.028189023956656456, "learning_rate": 9.556063143179735e-07, "loss": 0.0198, "num_tokens": 17194594.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3934311866760254, "sampling/importance_sampling_ratio/mean": 0.9997861981391907, "sampling/importance_sampling_ratio/min": 0.6242069005966187, "sampling/sampling_logp_difference/max": 0.47127342224121094, "sampling/sampling_logp_difference/mean": 0.014654896222054958, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 173.03125, "completions/mean_terminated_length": 173.03125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4660423696041107, "epoch": 0.6691176470588235, "frac_reward_zero_std": 0.75, "grad_norm": 1.0264795769518509, "kl": 0.034461360424757004, "learning_rate": 9.55312397077714e-07, "loss": 0.0338, "num_tokens": 17222260.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.808357834815979, "sampling/importance_sampling_ratio/mean": 0.9998067617416382, "sampling/importance_sampling_ratio/min": 0.7175599932670593, "sampling/sampling_logp_difference/max": 0.5924191474914551, "sampling/sampling_logp_difference/mean": 0.01736539974808693, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 218.71875, "completions/mean_terminated_length": 218.71875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.34565114974975586, "epoch": 0.6703431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0188774558366317, "kl": 0.024478325620293617, "learning_rate": 9.550175555821334e-07, "loss": 0.0002, "num_tokens": 17255666.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6260552406311035, "sampling/importance_sampling_ratio/mean": 0.9997800588607788, "sampling/importance_sampling_ratio/min": 0.6128374934196472, "sampling/sampling_logp_difference/max": 0.4896554946899414, "sampling/sampling_logp_difference/mean": 0.013118880800902843, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 187.59375, "completions/mean_terminated_length": 187.59375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4557947814464569, "epoch": 0.6715686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.9563803894898965, "kl": 0.02949117124080658, "learning_rate": 9.547217904297409e-07, "loss": 0.0284, "num_tokens": 17285576.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4972301721572876, "sampling/importance_sampling_ratio/mean": 1.0004384517669678, "sampling/importance_sampling_ratio/min": 0.674802303314209, "sampling/sampling_logp_difference/max": 0.40361690521240234, "sampling/sampling_logp_difference/mean": 0.016303321346640587, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 172.328125, "completions/mean_terminated_length": 172.328125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4239822030067444, "epoch": 0.6727941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.025517293583861147, "kl": 0.03518486022949219, "learning_rate": 9.544251022209216e-07, "loss": 0.0004, "num_tokens": 17320141.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744264125823975, "sampling/importance_sampling_ratio/mean": 1.0005159378051758, "sampling/importance_sampling_ratio/min": 0.7415908575057983, "sampling/sampling_logp_difference/max": 0.4538910388946533, "sampling/sampling_logp_difference/mean": 0.015085499733686447, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.29904037714004517, "epoch": 0.6740196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.024688332129142475, "kl": 0.02920544147491455, "learning_rate": 9.541274915579334e-07, "loss": 0.0003, "num_tokens": 17344965.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4720820188522339, "sampling/importance_sampling_ratio/mean": 0.9996750354766846, "sampling/importance_sampling_ratio/min": 0.4936216473579407, "sampling/sampling_logp_difference/max": 0.7059860229492188, "sampling/sampling_logp_difference/mean": 0.013709956780076027, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 208.609375, "completions/mean_terminated_length": 208.609375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.392837256193161, "epoch": 0.6752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.020842003603693177, "kl": 0.02458246424794197, "learning_rate": 9.538289590449071e-07, "loss": 0.0002, "num_tokens": 17377772.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5609105825424194, "sampling/importance_sampling_ratio/mean": 0.9998144507408142, "sampling/importance_sampling_ratio/min": 0.590291440486908, "sampling/sampling_logp_difference/max": 0.5271389484405518, "sampling/sampling_logp_difference/mean": 0.013637501746416092, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.40312328934669495, "epoch": 0.6764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.027996955651263342, "kl": 0.0280432291328907, "learning_rate": 9.535295052878449e-07, "loss": 0.0003, "num_tokens": 17408492.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4875401258468628, "sampling/importance_sampling_ratio/mean": 1.0007259845733643, "sampling/importance_sampling_ratio/min": 0.7447932362556458, "sampling/sampling_logp_difference/max": 0.3971238136291504, "sampling/sampling_logp_difference/mean": 0.014545347541570663, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 133.171875, "completions/mean_terminated_length": 133.171875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.30149027705192566, "epoch": 0.6776960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.034764274567367234, "kl": 0.03032582998275757, "learning_rate": 9.53229130894619e-07, "loss": 0.0003, "num_tokens": 17433271.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5182085037231445, "sampling/importance_sampling_ratio/mean": 0.9995010495185852, "sampling/importance_sampling_ratio/min": 0.6125888228416443, "sampling/sampling_logp_difference/max": 0.49006128311157227, "sampling/sampling_logp_difference/mean": 0.014048278331756592, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.45025965571403503, "epoch": 0.678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.2667513377636765, "kl": 0.035085082054138184, "learning_rate": 9.529278364749702e-07, "loss": 0.0295, "num_tokens": 17465835.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4873110055923462, "sampling/importance_sampling_ratio/mean": 0.9998199343681335, "sampling/importance_sampling_ratio/min": 0.6853278279304504, "sampling/sampling_logp_difference/max": 0.3969697952270508, "sampling/sampling_logp_difference/mean": 0.016888249665498734, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 191.28125, "completions/mean_terminated_length": 191.28125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4631187319755554, "epoch": 0.6801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.3981282689064027, "kl": 0.03789568692445755, "learning_rate": 9.526256226405073e-07, "loss": -0.0016, "num_tokens": 17494589.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4454609155654907, "sampling/importance_sampling_ratio/mean": 0.9997762441635132, "sampling/importance_sampling_ratio/min": 0.4871583580970764, "sampling/sampling_logp_difference/max": 0.7191660404205322, "sampling/sampling_logp_difference/mean": 0.015756256878376007, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 155.90625, "completions/mean_terminated_length": 155.90625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3670254349708557, "epoch": 0.6813725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.9368330927073913, "kl": 0.03317684680223465, "learning_rate": 9.523224900047051e-07, "loss": 0.0142, "num_tokens": 17518631.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4525972604751587, "sampling/importance_sampling_ratio/mean": 0.9997410774230957, "sampling/importance_sampling_ratio/min": 0.677254319190979, "sampling/sampling_logp_difference/max": 0.38970839977264404, "sampling/sampling_logp_difference/mean": 0.014513498172163963, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 168.484375, "completions/mean_terminated_length": 168.484375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4081436097621918, "epoch": 0.6825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02472310940055831, "kl": 0.02606888674199581, "learning_rate": 9.520184391829036e-07, "loss": 0.0003, "num_tokens": 17550422.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6272908449172974, "sampling/importance_sampling_ratio/mean": 1.0004005432128906, "sampling/importance_sampling_ratio/min": 0.6198011636734009, "sampling/sampling_logp_difference/max": 0.48691654205322266, "sampling/sampling_logp_difference/mean": 0.015636038035154343, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 181.203125, "completions/mean_terminated_length": 181.203125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4906782805919647, "epoch": 0.6838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.06104910140848077, "kl": 0.04024219140410423, "learning_rate": 9.517134707923069e-07, "loss": 0.0004, "num_tokens": 17582147.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8212308883666992, "sampling/importance_sampling_ratio/mean": 1.00010347366333, "sampling/importance_sampling_ratio/min": 0.6550912261009216, "sampling/sampling_logp_difference/max": 0.5995125770568848, "sampling/sampling_logp_difference/mean": 0.018436823040246964, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 167.4375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.36911672353744507, "epoch": 0.6850490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.04059392221494458, "kl": 0.03382398188114166, "learning_rate": 9.514075854519813e-07, "loss": 0.0003, "num_tokens": 17609327.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4043327569961548, "sampling/importance_sampling_ratio/mean": 0.9991638660430908, "sampling/importance_sampling_ratio/min": 0.6094186902046204, "sampling/sampling_logp_difference/max": 0.49524974822998047, "sampling/sampling_logp_difference/mean": 0.014528224244713783, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 194.359375, "completions/mean_terminated_length": 194.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4601079523563385, "epoch": 0.6862745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.8419090426673429, "kl": 0.03641054779291153, "learning_rate": 9.511007837828548e-07, "loss": -0.0052, "num_tokens": 17643894.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5000439882278442, "sampling/importance_sampling_ratio/mean": 1.0002095699310303, "sampling/importance_sampling_ratio/min": 0.6145036816596985, "sampling/sampling_logp_difference/max": 0.4869403839111328, "sampling/sampling_logp_difference/mean": 0.016088012605905533, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 226.3125, "completions/mean_terminated_length": 226.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.43548086285591125, "epoch": 0.6875, "frac_reward_zero_std": 0.75, "grad_norm": 0.6768620242535359, "kl": 0.025557810440659523, "learning_rate": 9.507930664077153e-07, "loss": 0.0145, "num_tokens": 17681338.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3931565284729004, "sampling/importance_sampling_ratio/mean": 0.9997798800468445, "sampling/importance_sampling_ratio/min": 0.5388790965080261, "sampling/sampling_logp_difference/max": 0.6182640194892883, "sampling/sampling_logp_difference/mean": 0.015264216810464859, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3668908476829529, "epoch": 0.6887254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.022978793147968482, "kl": 0.024108467623591423, "learning_rate": 9.504844339512094e-07, "loss": 0.0002, "num_tokens": 17713066.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000882148742676, "sampling/importance_sampling_ratio/min": 0.729113757610321, "sampling/sampling_logp_difference/max": 0.7371416091918945, "sampling/sampling_logp_difference/mean": 0.012965286150574684, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 149.734375, "completions/mean_terminated_length": 149.734375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.46979600191116333, "epoch": 0.6899509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 1.1338108304088952, "kl": 0.058991335332393646, "learning_rate": 9.501748870398419e-07, "loss": 0.0062, "num_tokens": 17740009.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6401453018188477, "sampling/importance_sampling_ratio/mean": 1.00006103515625, "sampling/importance_sampling_ratio/min": 0.6931049823760986, "sampling/sampling_logp_difference/max": 0.4947848320007324, "sampling/sampling_logp_difference/mean": 0.016827845945954323, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3002978265285492, "epoch": 0.6911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03904848237390688, "kl": 0.03209100291132927, "learning_rate": 9.498644263019731e-07, "loss": 0.0003, "num_tokens": 17769921.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4301660060882568, "sampling/importance_sampling_ratio/mean": 1.0002093315124512, "sampling/importance_sampling_ratio/min": 0.6262974739074707, "sampling/sampling_logp_difference/max": 0.4679298400878906, "sampling/sampling_logp_difference/mean": 0.013269246555864811, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 179.953125, "completions/mean_terminated_length": 179.953125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.5559252500534058, "epoch": 0.6924019607843137, "frac_reward_zero_std": 0.0, "grad_norm": 1.703238584274116, "kl": 0.06438381224870682, "learning_rate": 9.495530523678186e-07, "loss": 0.0092, "num_tokens": 17801038.0, "reward": -0.21875, "reward_std": 0.676956295967102, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5438649654388428, "sampling/importance_sampling_ratio/mean": 1.000163197517395, "sampling/importance_sampling_ratio/min": 0.6384997963905334, "sampling/sampling_logp_difference/max": 0.44863390922546387, "sampling/sampling_logp_difference/mean": 0.01882471702992916, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 186.296875, "completions/mean_terminated_length": 186.296875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.46670347452163696, "epoch": 0.6936274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 1.2810627770010172, "kl": 0.05042688176035881, "learning_rate": 9.492407658694477e-07, "loss": -0.0106, "num_tokens": 17828849.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.465358853340149, "sampling/importance_sampling_ratio/mean": 1.0000073909759521, "sampling/importance_sampling_ratio/min": 0.6191340684890747, "sampling/sampling_logp_difference/max": 0.479433536529541, "sampling/sampling_logp_difference/mean": 0.01790396124124527, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 173.859375, "completions/mean_terminated_length": 173.859375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.44244450330734253, "epoch": 0.6948529411764706, "frac_reward_zero_std": 0.25, "grad_norm": 1.5968482256097214, "kl": 0.05212203413248062, "learning_rate": 9.489275674407825e-07, "loss": -0.01, "num_tokens": 17856760.0, "reward": 0.34375, "reward_std": 0.5809217691421509, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5171775817871094, "sampling/importance_sampling_ratio/mean": 0.999706506729126, "sampling/importance_sampling_ratio/min": 0.641122043132782, "sampling/sampling_logp_difference/max": 0.444535493850708, "sampling/sampling_logp_difference/mean": 0.016785025596618652, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 133.09375, "completions/mean_terminated_length": 133.09375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.36766040325164795, "epoch": 0.696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.06045360132262736, "kl": 0.036403343081474304, "learning_rate": 9.486134577175957e-07, "loss": 0.0004, "num_tokens": 17879662.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4707084894180298, "sampling/importance_sampling_ratio/mean": 0.9995070695877075, "sampling/importance_sampling_ratio/min": 0.7074356079101562, "sampling/sampling_logp_difference/max": 0.38574421405792236, "sampling/sampling_logp_difference/mean": 0.01508602686226368, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 178.390625, "completions/mean_terminated_length": 178.390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.43458572030067444, "epoch": 0.6973039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.5381952728820387, "kl": 0.04792370647192001, "learning_rate": 9.482984373375104e-07, "loss": 0.0129, "num_tokens": 17910615.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4773439168930054, "sampling/importance_sampling_ratio/mean": 1.0000048875808716, "sampling/importance_sampling_ratio/min": 0.7157042026519775, "sampling/sampling_logp_difference/max": 0.39024579524993896, "sampling/sampling_logp_difference/mean": 0.017831791192293167, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.48558875918388367, "epoch": 0.6985294117647058, "frac_reward_zero_std": 0.25, "grad_norm": 1.6476404351655827, "kl": 0.06947138905525208, "learning_rate": 9.479825069399977e-07, "loss": -0.0056, "num_tokens": 17940721.0, "reward": 0.125, "reward_std": 0.6831300258636475, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.7248539924621582, "sampling/importance_sampling_ratio/mean": 0.9997266530990601, "sampling/importance_sampling_ratio/min": 0.6415937542915344, "sampling/sampling_logp_difference/max": 0.545142412185669, "sampling/sampling_logp_difference/mean": 0.01905631273984909, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 196.40625, "completions/mean_terminated_length": 196.40625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4202972948551178, "epoch": 0.6997549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.2268689030320934, "kl": 0.03882734477519989, "learning_rate": 9.476656671663766e-07, "loss": -0.0139, "num_tokens": 17975355.0, "reward": 0.6875, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5569466352462769, "sampling/importance_sampling_ratio/mean": 0.9996051788330078, "sampling/importance_sampling_ratio/min": 0.6746933460235596, "sampling/sampling_logp_difference/max": 0.44272661209106445, "sampling/sampling_logp_difference/mean": 0.015455996617674828, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.34861987829208374, "epoch": 0.7009803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.03036379946318668, "kl": 0.032124996185302734, "learning_rate": 9.473479186598114e-07, "loss": 0.0003, "num_tokens": 18002155.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.428978443145752, "sampling/importance_sampling_ratio/mean": 1.0002634525299072, "sampling/importance_sampling_ratio/min": 0.6447941064834595, "sampling/sampling_logp_difference/max": 0.43882429599761963, "sampling/sampling_logp_difference/mean": 0.016839023679494858, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 183.921875, "completions/mean_terminated_length": 183.921875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3184646964073181, "epoch": 0.7022058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.8855265123782843, "kl": 0.03036251664161682, "learning_rate": 9.470292620653119e-07, "loss": -0.019, "num_tokens": 18030918.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5637354850769043, "sampling/importance_sampling_ratio/mean": 0.9997586011886597, "sampling/importance_sampling_ratio/min": 0.6622517704963684, "sampling/sampling_logp_difference/max": 0.44707751274108887, "sampling/sampling_logp_difference/mean": 0.013248957693576813, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 232.34375, "completions/mean_terminated_length": 232.34375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4914621412754059, "epoch": 0.7034313725490197, "frac_reward_zero_std": 0.0, "grad_norm": 1.6498129315873309, "kl": 0.04989838972687721, "learning_rate": 9.467096980297304e-07, "loss": 0.0529, "num_tokens": 18063852.0, "reward": 0.6875, "reward_std": 0.690913200378418, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5468924045562744, "sampling/importance_sampling_ratio/mean": 0.9999454021453857, "sampling/importance_sampling_ratio/min": 0.6276541948318481, "sampling/sampling_logp_difference/max": 0.46576595306396484, "sampling/sampling_logp_difference/mean": 0.01687515154480934, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 232.953125, "completions/mean_terminated_length": 232.953125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.41729480028152466, "epoch": 0.7046568627450981, "frac_reward_zero_std": 0.25, "grad_norm": 1.327751073858522, "kl": 0.05011233687400818, "learning_rate": 9.463892272017618e-07, "loss": 0.0013, "num_tokens": 18099049.0, "reward": 0.53125, "reward_std": 0.7129635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007917881011963, "sampling/importance_sampling_ratio/min": 0.6794153451919556, "sampling/sampling_logp_difference/max": 0.846982479095459, "sampling/sampling_logp_difference/mean": 0.014939257875084877, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 184.359375, "completions/mean_terminated_length": 184.359375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4155169427394867, "epoch": 0.7058823529411765, "frac_reward_zero_std": 0.25, "grad_norm": 1.840841066458298, "kl": 0.040389060974121094, "learning_rate": 9.460678502319416e-07, "loss": -0.051, "num_tokens": 18126704.0, "reward": -0.03125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3847591876983643, "sampling/importance_sampling_ratio/mean": 0.999739408493042, "sampling/importance_sampling_ratio/min": 0.63115394115448, "sampling/sampling_logp_difference/max": 0.4602055549621582, "sampling/sampling_logp_difference/mean": 0.01599350944161415, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 288.578125, "completions/mean_terminated_length": 288.578125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.397461861371994, "epoch": 0.7071078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8568972578823097, "kl": 0.02606649324297905, "learning_rate": 9.457455677726447e-07, "loss": -0.002, "num_tokens": 18166325.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4671045541763306, "sampling/importance_sampling_ratio/mean": 0.9998229742050171, "sampling/importance_sampling_ratio/min": 0.6743451356887817, "sampling/sampling_logp_difference/max": 0.39401328563690186, "sampling/sampling_logp_difference/mean": 0.014239683747291565, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 298.015625, "completions/mean_terminated_length": 298.015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3736385107040405, "epoch": 0.7083333333333334, "frac_reward_zero_std": 0.75, "grad_norm": 0.6144241358813192, "kl": 0.026287073269486427, "learning_rate": 9.454223804780841e-07, "loss": 0.0106, "num_tokens": 18203526.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5859181880950928, "sampling/importance_sampling_ratio/mean": 1.000488519668579, "sampling/importance_sampling_ratio/min": 0.6167355179786682, "sampling/sampling_logp_difference/max": 0.48331499099731445, "sampling/sampling_logp_difference/mean": 0.013991307467222214, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.36850547790527344, "epoch": 0.7095588235294118, "frac_reward_zero_std": 0.5, "grad_norm": 1.1139940989323576, "kl": 0.03561537712812424, "learning_rate": 9.450982890043094e-07, "loss": -0.0003, "num_tokens": 18243430.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4850008487701416, "sampling/importance_sampling_ratio/mean": 0.9996840357780457, "sampling/importance_sampling_ratio/min": 0.6319139003753662, "sampling/sampling_logp_difference/max": 0.45900213718414307, "sampling/sampling_logp_difference/mean": 0.01356898620724678, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 295.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4161064624786377, "epoch": 0.7107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6783415704971543, "kl": 0.03330086171627045, "learning_rate": 9.447732940092059e-07, "loss": 0.0223, "num_tokens": 18283046.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.493503212928772, "sampling/importance_sampling_ratio/mean": 0.9998776912689209, "sampling/importance_sampling_ratio/min": 0.6947564482688904, "sampling/sampling_logp_difference/max": 0.4011244773864746, "sampling/sampling_logp_difference/mean": 0.015456114895641804, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 291.359375, "completions/mean_terminated_length": 291.359375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.45157065987586975, "epoch": 0.7120098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 0.8073574799969249, "kl": 0.03743080794811249, "learning_rate": 9.444473961524927e-07, "loss": -0.0227, "num_tokens": 18330797.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.592615008354187, "sampling/importance_sampling_ratio/mean": 1.0003517866134644, "sampling/importance_sampling_ratio/min": 0.45972323417663574, "sampling/sampling_logp_difference/max": 0.7771306037902832, "sampling/sampling_logp_difference/mean": 0.016215139999985695, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 270.40625, "completions/mean_terminated_length": 270.40625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.34220874309539795, "epoch": 0.7132352941176471, "frac_reward_zero_std": 0.75, "grad_norm": 0.8146313188462274, "kl": 0.034279175102710724, "learning_rate": 9.441205960957219e-07, "loss": 0.052, "num_tokens": 18367975.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.8494188785552979, "sampling/importance_sampling_ratio/mean": 0.9999802708625793, "sampling/importance_sampling_ratio/min": 0.6302123069763184, "sampling/sampling_logp_difference/max": 0.6148715019226074, "sampling/sampling_logp_difference/mean": 0.013569234870374203, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 226.015625, "completions/mean_terminated_length": 226.015625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3353632688522339, "epoch": 0.7144607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.05072026022762656, "kl": 0.04300938546657562, "learning_rate": 9.43792894502277e-07, "loss": 0.0004, "num_tokens": 18398888.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.521698236465454, "sampling/importance_sampling_ratio/mean": 1.0002022981643677, "sampling/importance_sampling_ratio/min": 0.608488917350769, "sampling/sampling_logp_difference/max": 0.4967765808105469, "sampling/sampling_logp_difference/mean": 0.014746149070560932, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 334.71875, "completions/mean_terminated_length": 334.71875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.416095495223999, "epoch": 0.7156862745098039, "frac_reward_zero_std": 0.5, "grad_norm": 0.8687483700560874, "kl": 0.03589534014463425, "learning_rate": 9.434642920373713e-07, "loss": -0.0082, "num_tokens": 18442230.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3913553953170776, "sampling/importance_sampling_ratio/mean": 0.9997699856758118, "sampling/importance_sampling_ratio/min": 0.7111809849739075, "sampling/sampling_logp_difference/max": 0.3408282995223999, "sampling/sampling_logp_difference/mean": 0.013978583738207817, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 252.78125, "completions/mean_terminated_length": 252.78125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3994787335395813, "epoch": 0.7169117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 0.7470459942229324, "kl": 0.048064589500427246, "learning_rate": 9.431347893680472e-07, "loss": -0.0062, "num_tokens": 18473192.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4770106077194214, "sampling/importance_sampling_ratio/mean": 0.9999533295631409, "sampling/importance_sampling_ratio/min": 0.6099969744682312, "sampling/sampling_logp_difference/max": 0.49430131912231445, "sampling/sampling_logp_difference/mean": 0.016215935349464417, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 266.203125, "completions/mean_terminated_length": 266.203125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2780820429325104, "epoch": 0.7181372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.05454274155724163, "kl": 0.0379292257130146, "learning_rate": 9.428043871631739e-07, "loss": 0.0004, "num_tokens": 18505957.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4753788709640503, "sampling/importance_sampling_ratio/mean": 1.0000276565551758, "sampling/importance_sampling_ratio/min": 0.6772292852401733, "sampling/sampling_logp_difference/max": 0.3897453546524048, "sampling/sampling_logp_difference/mean": 0.011633490212261677, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 378.421875, "completions/mean_terminated_length": 378.421875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.4514898359775543, "epoch": 0.7193627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.6391472605750311, "kl": 0.038221873342990875, "learning_rate": 9.424730860934472e-07, "loss": -0.0202, "num_tokens": 18553408.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5621294975280762, "sampling/importance_sampling_ratio/mean": 1.0005515813827515, "sampling/importance_sampling_ratio/min": 0.6183229684829712, "sampling/sampling_logp_difference/max": 0.4807443618774414, "sampling/sampling_logp_difference/mean": 0.015158019959926605, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 216.390625, "completions/mean_terminated_length": 216.390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.34938451647758484, "epoch": 0.7205882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.03940199990870377, "kl": 0.047378502786159515, "learning_rate": 9.421408868313873e-07, "loss": 0.0004, "num_tokens": 18581321.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4404903650283813, "sampling/importance_sampling_ratio/mean": 1.0000039339065552, "sampling/importance_sampling_ratio/min": 0.6431145071983337, "sampling/sampling_logp_difference/max": 0.44143247604370117, "sampling/sampling_logp_difference/mean": 0.014487783424556255, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 281.59375, "completions/mean_terminated_length": 281.59375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3534771203994751, "epoch": 0.7218137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.03664400527724794, "kl": 0.048589833080768585, "learning_rate": 9.418077900513376e-07, "loss": 0.0004, "num_tokens": 18616127.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5671957731246948, "sampling/importance_sampling_ratio/mean": 1.0001347064971924, "sampling/importance_sampling_ratio/min": 0.7090094685554504, "sampling/sampling_logp_difference/max": 0.44928789138793945, "sampling/sampling_logp_difference/mean": 0.013496083207428455, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 254.546875, "completions/mean_terminated_length": 254.546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4544410705566406, "epoch": 0.7230392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.16074988784042146, "kl": 0.05074925348162651, "learning_rate": 9.414737964294634e-07, "loss": 0.0005, "num_tokens": 18653218.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7520911693572998, "sampling/importance_sampling_ratio/mean": 1.0004611015319824, "sampling/importance_sampling_ratio/min": 0.09077741950750351, "sampling/sampling_logp_difference/max": 2.3993446826934814, "sampling/sampling_logp_difference/mean": 0.016986709088087082, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 343.40625, "completions/mean_terminated_length": 343.40625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.36447951197624207, "epoch": 0.7242647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02865242764091762, "kl": 0.03275573253631592, "learning_rate": 9.411389066437507e-07, "loss": 0.0003, "num_tokens": 18696428.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5326827764511108, "sampling/importance_sampling_ratio/mean": 1.0002455711364746, "sampling/importance_sampling_ratio/min": 0.6872919201850891, "sampling/sampling_logp_difference/max": 0.4270195960998535, "sampling/sampling_logp_difference/mean": 0.012679225765168667, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 274.859375, "completions/mean_terminated_length": 274.859375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.38698798418045044, "epoch": 0.7254901960784313, "frac_reward_zero_std": 0.5, "grad_norm": 0.900943564636299, "kl": 0.05070265009999275, "learning_rate": 9.408031213740044e-07, "loss": -0.0006, "num_tokens": 18730931.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6438260078430176, "sampling/importance_sampling_ratio/mean": 0.9993126392364502, "sampling/importance_sampling_ratio/min": 0.689705491065979, "sampling/sampling_logp_difference/max": 0.4970264434814453, "sampling/sampling_logp_difference/mean": 0.013545414432883263, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 358.34375, "completions/mean_terminated_length": 358.34375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.36991560459136963, "epoch": 0.7267156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.5301147887989228, "kl": 0.04598201811313629, "learning_rate": 9.404664413018476e-07, "loss": -0.0099, "num_tokens": 18775785.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7139068841934204, "sampling/importance_sampling_ratio/mean": 1.0006451606750488, "sampling/importance_sampling_ratio/min": 0.4161494970321655, "sampling/sampling_logp_difference/max": 0.8767106533050537, "sampling/sampling_logp_difference/mean": 0.014390267431735992, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 311.6875, "completions/mean_terminated_length": 311.6875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4156648516654968, "epoch": 0.7279411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 0.5723504706838156, "kl": 0.03343087434768677, "learning_rate": 9.401288671107193e-07, "loss": -0.0006, "num_tokens": 18816101.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6876107454299927, "sampling/importance_sampling_ratio/mean": 1.0000860691070557, "sampling/importance_sampling_ratio/min": 0.6515225768089294, "sampling/sampling_logp_difference/max": 0.5233137607574463, "sampling/sampling_logp_difference/mean": 0.0153332045301795, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 342.5625, "completions/mean_terminated_length": 342.5625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.40205198526382446, "epoch": 0.7291666666666666, "frac_reward_zero_std": 0.5, "grad_norm": 0.7865444424660837, "kl": 0.03984655812382698, "learning_rate": 9.397903994858735e-07, "loss": 0.0438, "num_tokens": 18857817.0, "reward": 0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4517771005630493, "sampling/importance_sampling_ratio/mean": 0.9999061822891235, "sampling/importance_sampling_ratio/min": 0.6904515624046326, "sampling/sampling_logp_difference/max": 0.3727884292602539, "sampling/sampling_logp_difference/mean": 0.012401677668094635, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 280.765625, "completions/mean_terminated_length": 280.765625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.36197447776794434, "epoch": 0.7303921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.040535233546695075, "kl": 0.049834877252578735, "learning_rate": 9.394510391143786e-07, "loss": 0.0004, "num_tokens": 18891242.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.42715585231781, "sampling/importance_sampling_ratio/mean": 0.9997456073760986, "sampling/importance_sampling_ratio/min": 0.6623404026031494, "sampling/sampling_logp_difference/max": 0.411975622177124, "sampling/sampling_logp_difference/mean": 0.013757757842540741, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 399.515625, "completions/mean_terminated_length": 399.515625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.34008845686912537, "epoch": 0.7316176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.025416439014180003, "kl": 0.030293263494968414, "learning_rate": 9.391107866851142e-07, "loss": 0.0003, "num_tokens": 18949787.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6054352521896362, "sampling/importance_sampling_ratio/mean": 1.0004926919937134, "sampling/importance_sampling_ratio/min": 0.5919367074966431, "sampling/sampling_logp_difference/max": 0.5243555307388306, "sampling/sampling_logp_difference/mean": 0.012024283409118652, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 234.0625, "completions/mean_terminated_length": 234.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4019208550453186, "epoch": 0.7328431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.04831611945362319, "kl": 0.057452306151390076, "learning_rate": 9.387696428887715e-07, "loss": 0.0005, "num_tokens": 18979167.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6169447898864746, "sampling/importance_sampling_ratio/mean": 1.0006022453308105, "sampling/importance_sampling_ratio/min": 0.5576868057250977, "sampling/sampling_logp_difference/max": 0.5839577913284302, "sampling/sampling_logp_difference/mean": 0.0151644516736269, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3913234770298004, "epoch": 0.7340686274509803, "frac_reward_zero_std": 0.75, "grad_norm": 0.8912620717071558, "kl": 0.07460720092058182, "learning_rate": 9.384276084178504e-07, "loss": 0.0116, "num_tokens": 19007551.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.448309302330017, "sampling/importance_sampling_ratio/mean": 0.9996531009674072, "sampling/importance_sampling_ratio/min": 0.6317389011383057, "sampling/sampling_logp_difference/max": 0.45927906036376953, "sampling/sampling_logp_difference/mean": 0.01611633598804474, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 258.28125, "completions/mean_terminated_length": 258.28125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.47644010186195374, "epoch": 0.7352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.03643086357138928, "kl": 0.04496239125728607, "learning_rate": 9.380846839666595e-07, "loss": 0.0004, "num_tokens": 19056913.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7568045854568481, "sampling/importance_sampling_ratio/mean": 1.0003657341003418, "sampling/importance_sampling_ratio/min": 0.6972380876541138, "sampling/sampling_logp_difference/max": 0.5634965896606445, "sampling/sampling_logp_difference/mean": 0.014628026634454727, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 273.390625, "completions/mean_terminated_length": 273.390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3627666234970093, "epoch": 0.7365196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.03865247668415776, "kl": 0.05126260966062546, "learning_rate": 9.377408702313136e-07, "loss": 0.0005, "num_tokens": 19090426.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4355396032333374, "sampling/importance_sampling_ratio/mean": 0.9997859001159668, "sampling/importance_sampling_ratio/min": 0.6249046325683594, "sampling/sampling_logp_difference/max": 0.470156192779541, "sampling/sampling_logp_difference/mean": 0.014393947087228298, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 242.046875, "completions/mean_terminated_length": 242.046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3892621397972107, "epoch": 0.7377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03507889196843655, "kl": 0.0562661848962307, "learning_rate": 9.37396167909733e-07, "loss": 0.0005, "num_tokens": 19124669.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7792092561721802, "sampling/importance_sampling_ratio/mean": 0.9994032979011536, "sampling/importance_sampling_ratio/min": 0.5018308758735657, "sampling/sampling_logp_difference/max": 0.6894922256469727, "sampling/sampling_logp_difference/mean": 0.01530286855995655, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 262.828125, "completions/mean_terminated_length": 262.828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.372028648853302, "epoch": 0.7389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.051174383054545663, "kl": 0.048788998275995255, "learning_rate": 9.370505777016413e-07, "loss": 0.0005, "num_tokens": 19157810.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744249820709229, "sampling/importance_sampling_ratio/mean": 1.0003635883331299, "sampling/importance_sampling_ratio/min": 0.7011212706565857, "sampling/sampling_logp_difference/max": 0.4538900852203369, "sampling/sampling_logp_difference/mean": 0.014637460000813007, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 245.4375, "completions/mean_terminated_length": 245.4375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.40267762541770935, "epoch": 0.7401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.03602906711181945, "kl": 0.0445004478096962, "learning_rate": 9.367041003085648e-07, "loss": 0.0004, "num_tokens": 19192510.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6365485191345215, "sampling/importance_sampling_ratio/mean": 1.0004836320877075, "sampling/importance_sampling_ratio/min": 0.6401402354240417, "sampling/sampling_logp_difference/max": 0.49258947372436523, "sampling/sampling_logp_difference/mean": 0.016224956139922142, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 296.890625, "completions/mean_terminated_length": 296.890625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.4386707544326782, "epoch": 0.741421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.04674979673877471, "kl": 0.03908385708928108, "learning_rate": 9.363567364338307e-07, "loss": 0.0004, "num_tokens": 19233335.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3713226318359375, "sampling/importance_sampling_ratio/mean": 0.9998978972434998, "sampling/importance_sampling_ratio/min": 0.695604681968689, "sampling/sampling_logp_difference/max": 0.362973690032959, "sampling/sampling_logp_difference/mean": 0.014243248850107193, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 266.96875, "completions/mean_terminated_length": 266.96875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3504762351512909, "epoch": 0.7426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6962367993223767, "kl": 0.054517414420843124, "learning_rate": 9.360084867825658e-07, "loss": 0.013, "num_tokens": 19270261.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6438707113265991, "sampling/importance_sampling_ratio/mean": 1.000388264656067, "sampling/importance_sampling_ratio/min": 0.6260436773300171, "sampling/sampling_logp_difference/max": 0.4970536231994629, "sampling/sampling_logp_difference/mean": 0.01413656771183014, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 246.703125, "completions/mean_terminated_length": 246.703125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.39139968156814575, "epoch": 0.7438725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.029854538741684488, "kl": 0.045577920973300934, "learning_rate": 9.356593520616946e-07, "loss": 0.0004, "num_tokens": 19315426.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2952314615249634, "sampling/importance_sampling_ratio/mean": 1.0002150535583496, "sampling/importance_sampling_ratio/min": 0.698577344417572, "sampling/sampling_logp_difference/max": 0.35870933532714844, "sampling/sampling_logp_difference/mean": 0.013229596428573132, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 246.078125, "completions/mean_terminated_length": 246.078125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4169134795665741, "epoch": 0.7450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7135715337463644, "kl": 0.049882128834724426, "learning_rate": 9.353093329799386e-07, "loss": 0.0057, "num_tokens": 19345783.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4834805727005005, "sampling/importance_sampling_ratio/mean": 0.9999332427978516, "sampling/importance_sampling_ratio/min": 0.6186323165893555, "sampling/sampling_logp_difference/max": 0.48024415969848633, "sampling/sampling_logp_difference/mean": 0.015446479432284832, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 236.28125, "completions/mean_terminated_length": 236.28125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3663399815559387, "epoch": 0.7463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7660803450669008, "kl": 0.04421684145927429, "learning_rate": 9.349584302478144e-07, "loss": 0.0135, "num_tokens": 19380681.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3435853719711304, "sampling/importance_sampling_ratio/mean": 0.9998900294303894, "sampling/importance_sampling_ratio/min": 0.6743672490119934, "sampling/sampling_logp_difference/max": 0.3939805030822754, "sampling/sampling_logp_difference/mean": 0.013786327093839645, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 244.421875, "completions/mean_terminated_length": 244.421875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3699806034564972, "epoch": 0.7475490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.028794080921610312, "kl": 0.032179079949855804, "learning_rate": 9.346066445776321e-07, "loss": 0.0003, "num_tokens": 19414468.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5622196197509766, "sampling/importance_sampling_ratio/mean": 0.9998641014099121, "sampling/importance_sampling_ratio/min": 0.6900743246078491, "sampling/sampling_logp_difference/max": 0.4461076259613037, "sampling/sampling_logp_difference/mean": 0.012894706800580025, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 252.9375, "completions/mean_terminated_length": 252.9375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3648497760295868, "epoch": 0.7487745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03219960872218517, "kl": 0.04307214915752411, "learning_rate": 9.342539766834945e-07, "loss": 0.0004, "num_tokens": 19447536.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5771461725234985, "sampling/importance_sampling_ratio/mean": 1.0005178451538086, "sampling/importance_sampling_ratio/min": 0.6526509523391724, "sampling/sampling_logp_difference/max": 0.45561695098876953, "sampling/sampling_logp_difference/mean": 0.013854766264557838, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 288.46875, "completions/mean_terminated_length": 288.46875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3499077558517456, "epoch": 0.75, "frac_reward_zero_std": 1.0, "grad_norm": 0.02198641352607768, "kl": 0.029847459867596626, "learning_rate": 9.339004272812949e-07, "loss": 0.0003, "num_tokens": 19485646.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8180161714553833, "sampling/importance_sampling_ratio/mean": 0.9999338388442993, "sampling/importance_sampling_ratio/min": 0.4954237639904022, "sampling/sampling_logp_difference/max": 0.7023417949676514, "sampling/sampling_logp_difference/mean": 0.012782221660017967, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 187.453125, "completions/mean_terminated_length": 187.453125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.40029793977737427, "epoch": 0.7512254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.04016545177690515, "kl": 0.04481827840209007, "learning_rate": 9.335459970887165e-07, "loss": 0.0004, "num_tokens": 19514507.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.563207983970642, "sampling/importance_sampling_ratio/mean": 1.000853180885315, "sampling/importance_sampling_ratio/min": 0.625471830368042, "sampling/sampling_logp_difference/max": 0.46924901008605957, "sampling/sampling_logp_difference/mean": 0.0157247856259346, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 244.90625, "completions/mean_terminated_length": 244.90625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.38803213834762573, "epoch": 0.7524509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02340689613733759, "kl": 0.03086831234395504, "learning_rate": 9.331906868252299e-07, "loss": 0.0003, "num_tokens": 19551413.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3239327669143677, "sampling/importance_sampling_ratio/mean": 0.9999281764030457, "sampling/importance_sampling_ratio/min": 0.6368730068206787, "sampling/sampling_logp_difference/max": 0.4511849880218506, "sampling/sampling_logp_difference/mean": 0.014674468897283077, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 239.265625, "completions/mean_terminated_length": 239.265625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.39600038528442383, "epoch": 0.7536764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.028293929103437802, "kl": 0.03476922959089279, "learning_rate": 9.328344972120925e-07, "loss": 0.0003, "num_tokens": 19587126.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5374728441238403, "sampling/importance_sampling_ratio/mean": 1.000159502029419, "sampling/importance_sampling_ratio/min": 0.6482195258140564, "sampling/sampling_logp_difference/max": 0.43352580070495605, "sampling/sampling_logp_difference/mean": 0.014290915802121162, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 217.46875, "completions/mean_terminated_length": 217.46875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.41973811388015747, "epoch": 0.7549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9031670760037284, "kl": 0.042274974286556244, "learning_rate": 9.324774289723467e-07, "loss": -0.0065, "num_tokens": 19621748.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4025861024856567, "sampling/importance_sampling_ratio/mean": 1.0000026226043701, "sampling/importance_sampling_ratio/min": 0.6208490133285522, "sampling/sampling_logp_difference/max": 0.4766674041748047, "sampling/sampling_logp_difference/mean": 0.0144076282158494, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 243.609375, "completions/mean_terminated_length": 243.609375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.31403636932373047, "epoch": 0.7561274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.5682684804508696, "kl": 0.02878103218972683, "learning_rate": 9.321194828308183e-07, "loss": -0.0059, "num_tokens": 19651995.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3645097017288208, "sampling/importance_sampling_ratio/mean": 0.9994995594024658, "sampling/importance_sampling_ratio/min": 0.62545245885849, "sampling/sampling_logp_difference/max": 0.4692800045013428, "sampling/sampling_logp_difference/mean": 0.012476968578994274, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 216.34375, "completions/mean_terminated_length": 216.34375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4400501251220703, "epoch": 0.7573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.7444790506242778, "kl": 0.03834313154220581, "learning_rate": 9.317606595141155e-07, "loss": 0.0073, "num_tokens": 19683073.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5071570873260498, "sampling/importance_sampling_ratio/mean": 0.9999115467071533, "sampling/importance_sampling_ratio/min": 0.6173469424247742, "sampling/sampling_logp_difference/max": 0.48232412338256836, "sampling/sampling_logp_difference/mean": 0.017789751291275024, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 218.328125, "completions/mean_terminated_length": 218.328125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3517623543739319, "epoch": 0.758578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9659575956221583, "kl": 0.03380831331014633, "learning_rate": 9.314009597506265e-07, "loss": 0.0099, "num_tokens": 19710998.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6260311603546143, "sampling/importance_sampling_ratio/mean": 1.0004644393920898, "sampling/importance_sampling_ratio/min": 0.6954984068870544, "sampling/sampling_logp_difference/max": 0.4861421585083008, "sampling/sampling_logp_difference/mean": 0.014174041338264942, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 203.1875, "completions/mean_terminated_length": 203.1875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.41839948296546936, "epoch": 0.7598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.03355588255284209, "kl": 0.038350485265254974, "learning_rate": 9.310403842705194e-07, "loss": 0.0004, "num_tokens": 19740242.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3223214149475098, "sampling/importance_sampling_ratio/mean": 0.9994974136352539, "sampling/importance_sampling_ratio/min": 0.626263439655304, "sampling/sampling_logp_difference/max": 0.4679841995239258, "sampling/sampling_logp_difference/mean": 0.016148468479514122, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 253.9375, "completions/mean_terminated_length": 253.9375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4144206941127777, "epoch": 0.7610294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02585294348524462, "kl": 0.03346798196434975, "learning_rate": 9.306789338057393e-07, "loss": 0.0003, "num_tokens": 19777390.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4269086122512817, "sampling/importance_sampling_ratio/mean": 0.9993743896484375, "sampling/importance_sampling_ratio/min": 0.6089751124382019, "sampling/sampling_logp_difference/max": 0.49597787857055664, "sampling/sampling_logp_difference/mean": 0.01556416880339384, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 205.484375, "completions/mean_terminated_length": 205.484375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.35567742586135864, "epoch": 0.7622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02446956236989713, "kl": 0.030868055298924446, "learning_rate": 9.303166090900081e-07, "loss": 0.0003, "num_tokens": 19805069.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.482351303100586, "sampling/importance_sampling_ratio/mean": 1.0001112222671509, "sampling/importance_sampling_ratio/min": 0.6232286691665649, "sampling/sampling_logp_difference/max": 0.472841739654541, "sampling/sampling_logp_difference/mean": 0.015565713867545128, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 179.296875, "completions/mean_terminated_length": 179.296875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.37179112434387207, "epoch": 0.7634803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.026836675117688488, "kl": 0.03199257701635361, "learning_rate": 9.299534108588217e-07, "loss": 0.0003, "num_tokens": 19834064.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2945696115493774, "sampling/importance_sampling_ratio/mean": 1.0003085136413574, "sampling/importance_sampling_ratio/min": 0.7167701721191406, "sampling/sampling_logp_difference/max": 0.3330000638961792, "sampling/sampling_logp_difference/mean": 0.015221469104290009, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 193.78125, "completions/mean_terminated_length": 193.78125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3535550832748413, "epoch": 0.7647058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.021615991914011708, "kl": 0.026434477418661118, "learning_rate": 9.295893398494497e-07, "loss": 0.0003, "num_tokens": 19865090.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7394191026687622, "sampling/importance_sampling_ratio/mean": 1.0003294944763184, "sampling/importance_sampling_ratio/min": 0.381239652633667, "sampling/sampling_logp_difference/max": 0.9643270969390869, "sampling/sampling_logp_difference/mean": 0.014835287816822529, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 204.671875, "completions/mean_terminated_length": 204.671875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3836597204208374, "epoch": 0.7659313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.1315714816878374, "kl": 0.052863769233226776, "learning_rate": 9.29224396800933e-07, "loss": 0.0004, "num_tokens": 19894317.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.589585542678833, "sampling/importance_sampling_ratio/mean": 1.0002679824829102, "sampling/importance_sampling_ratio/min": 0.4982438087463379, "sampling/sampling_logp_difference/max": 0.6966657638549805, "sampling/sampling_logp_difference/mean": 0.01588052697479725, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 184.109375, "completions/mean_terminated_length": 184.109375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3631596863269806, "epoch": 0.7671568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.024232601747706098, "kl": 0.030641362071037292, "learning_rate": 9.288585824540832e-07, "loss": 0.0003, "num_tokens": 19925364.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3705593347549438, "sampling/importance_sampling_ratio/mean": 0.9995449185371399, "sampling/importance_sampling_ratio/min": 0.6803103685379028, "sampling/sampling_logp_difference/max": 0.38520610332489014, "sampling/sampling_logp_difference/mean": 0.016184469684958458, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 139.046875, "completions/mean_terminated_length": 139.046875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3215046525001526, "epoch": 0.7683823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.031171899339627742, "kl": 0.03496295213699341, "learning_rate": 9.284918975514797e-07, "loss": 0.0003, "num_tokens": 19948759.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3640942573547363, "sampling/importance_sampling_ratio/mean": 1.000285029411316, "sampling/importance_sampling_ratio/min": 0.6998546123504639, "sampling/sampling_logp_difference/max": 0.3568826913833618, "sampling/sampling_logp_difference/mean": 0.014979102648794651, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 209.171875, "completions/mean_terminated_length": 209.171875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3832654654979706, "epoch": 0.7696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.028730570751950475, "kl": 0.03670923411846161, "learning_rate": 9.281243428374701e-07, "loss": 0.0004, "num_tokens": 19975922.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5638998746871948, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.6141809225082397, "sampling/sampling_logp_difference/max": 0.4874657988548279, "sampling/sampling_logp_difference/mean": 0.015809854492545128, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 207.734375, "completions/mean_terminated_length": 207.734375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.34388265013694763, "epoch": 0.7708333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02739606455882557, "kl": 0.02548704668879509, "learning_rate": 9.277559190581669e-07, "loss": 0.0002, "num_tokens": 20013633.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5587172508239746, "sampling/importance_sampling_ratio/mean": 1.0006846189498901, "sampling/importance_sampling_ratio/min": 0.6293648481369019, "sampling/sampling_logp_difference/max": 0.4630441665649414, "sampling/sampling_logp_difference/mean": 0.014479327946901321, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 153.078125, "completions/mean_terminated_length": 153.078125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.38949495553970337, "epoch": 0.7720588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.02700349170585698, "kl": 0.0368114598095417, "learning_rate": 9.273866269614473e-07, "loss": 0.0004, "num_tokens": 20038246.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.358526349067688, "sampling/importance_sampling_ratio/mean": 0.9988189935684204, "sampling/importance_sampling_ratio/min": 0.6289469003677368, "sampling/sampling_logp_difference/max": 0.46370840072631836, "sampling/sampling_logp_difference/mean": 0.01636747270822525, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 179.71875, "completions/mean_terminated_length": 179.71875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4201076030731201, "epoch": 0.7732843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.029579892735116912, "kl": 0.033645324409008026, "learning_rate": 9.270164672969507e-07, "loss": 0.0003, "num_tokens": 20062724.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999145269393921, "sampling/importance_sampling_ratio/min": 0.643266499042511, "sampling/sampling_logp_difference/max": 0.7331724166870117, "sampling/sampling_logp_difference/mean": 0.016295362263917923, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 173.03125, "completions/mean_terminated_length": 173.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4026562571525574, "epoch": 0.7745098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.026486712340398812, "kl": 0.0320625826716423, "learning_rate": 9.266454408160777e-07, "loss": 0.0003, "num_tokens": 20090422.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5595238208770752, "sampling/importance_sampling_ratio/mean": 1.0004431009292603, "sampling/importance_sampling_ratio/min": 0.6771566867828369, "sampling/sampling_logp_difference/max": 0.444380521774292, "sampling/sampling_logp_difference/mean": 0.016752826049923897, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 150.65625, "completions/mean_terminated_length": 150.65625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2996982932090759, "epoch": 0.7757352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.03653067439866391, "kl": 0.036317117512226105, "learning_rate": 9.262735482719887e-07, "loss": 0.0004, "num_tokens": 20113648.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5977691411972046, "sampling/importance_sampling_ratio/mean": 1.0001448392868042, "sampling/importance_sampling_ratio/min": 0.6234261989593506, "sampling/sampling_logp_difference/max": 0.47252488136291504, "sampling/sampling_logp_difference/mean": 0.015517426654696465, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3743230998516083, "epoch": 0.7769607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.027387618909479224, "kl": 0.03293408080935478, "learning_rate": 9.259007904196021e-07, "loss": 0.0003, "num_tokens": 20144928.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4939587116241455, "sampling/importance_sampling_ratio/mean": 1.000123143196106, "sampling/importance_sampling_ratio/min": 0.6464174389839172, "sampling/sampling_logp_difference/max": 0.436309814453125, "sampling/sampling_logp_difference/mean": 0.016843587160110474, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 226.59375, "completions/mean_terminated_length": 226.59375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.4369206428527832, "epoch": 0.7781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.8060044236768866, "kl": 0.03182988241314888, "learning_rate": 9.255271680155923e-07, "loss": 0.0117, "num_tokens": 20178966.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.445049524307251, "sampling/importance_sampling_ratio/mean": 1.0002126693725586, "sampling/importance_sampling_ratio/min": 0.6056285500526428, "sampling/sampling_logp_difference/max": 0.501488447189331, "sampling/sampling_logp_difference/mean": 0.016117552295327187, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 202.328125, "completions/mean_terminated_length": 202.328125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.339857280254364, "epoch": 0.7794117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.02036401823812101, "kl": 0.021871455013751984, "learning_rate": 9.251526818183896e-07, "loss": 0.0002, "num_tokens": 20213131.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5074704885482788, "sampling/importance_sampling_ratio/mean": 1.0000642538070679, "sampling/importance_sampling_ratio/min": 0.6117793917655945, "sampling/sampling_logp_difference/max": 0.49138355255126953, "sampling/sampling_logp_difference/mean": 0.014778957702219486, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 181.734375, "completions/mean_terminated_length": 181.734375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.31528449058532715, "epoch": 0.7806372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.02285719484122413, "kl": 0.026205450296401978, "learning_rate": 9.247773325881769e-07, "loss": 0.0003, "num_tokens": 20240298.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5207531452178955, "sampling/importance_sampling_ratio/mean": 1.0005592107772827, "sampling/importance_sampling_ratio/min": 0.6264617443084717, "sampling/sampling_logp_difference/max": 0.4676675796508789, "sampling/sampling_logp_difference/mean": 0.013754029758274555, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 189.734375, "completions/mean_terminated_length": 189.734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3886377811431885, "epoch": 0.7818627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.8941321714645836, "kl": 0.03759963810443878, "learning_rate": 9.244011210868895e-07, "loss": -0.012, "num_tokens": 20270441.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6039882898330688, "sampling/importance_sampling_ratio/mean": 1.0004351139068604, "sampling/importance_sampling_ratio/min": 0.5586837530136108, "sampling/sampling_logp_difference/max": 0.5821716785430908, "sampling/sampling_logp_difference/mean": 0.016681428998708725, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 199.15625, "completions/mean_terminated_length": 199.15625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30207303166389465, "epoch": 0.7830882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.02134380872013604, "kl": 0.02383604645729065, "learning_rate": 9.240240480782129e-07, "loss": 0.0002, "num_tokens": 20299667.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8481186628341675, "sampling/importance_sampling_ratio/mean": 0.9998188614845276, "sampling/importance_sampling_ratio/min": 0.5087823867797852, "sampling/sampling_logp_difference/max": 0.6757348775863647, "sampling/sampling_logp_difference/mean": 0.014480200596153736, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 194.59375, "completions/mean_terminated_length": 194.59375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3990108370780945, "epoch": 0.7843137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.026674649992018927, "kl": 0.03336282819509506, "learning_rate": 9.236461143275815e-07, "loss": 0.0003, "num_tokens": 20332089.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6365165710449219, "sampling/importance_sampling_ratio/mean": 0.9999001622200012, "sampling/importance_sampling_ratio/min": 0.5185440182685852, "sampling/sampling_logp_difference/max": 0.6567304134368896, "sampling/sampling_logp_difference/mean": 0.01609973981976509, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 184.828125, "completions/mean_terminated_length": 184.828125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.37371888756752014, "epoch": 0.7855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.9130812228838805, "kl": 0.030252164229750633, "learning_rate": 9.232673206021767e-07, "loss": -0.0069, "num_tokens": 20358302.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.471134066581726, "sampling/importance_sampling_ratio/mean": 0.9999719262123108, "sampling/importance_sampling_ratio/min": 0.60378497838974, "sampling/sampling_logp_difference/max": 0.5045371055603027, "sampling/sampling_logp_difference/mean": 0.01553965825587511, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 184.4375, "completions/mean_terminated_length": 184.4375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3092377483844757, "epoch": 0.7867647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 1.0526300537508844, "kl": 0.028306782245635986, "learning_rate": 9.228876676709259e-07, "loss": -0.0171, "num_tokens": 20385738.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.44679594039917, "sampling/importance_sampling_ratio/mean": 1.0000028610229492, "sampling/importance_sampling_ratio/min": 0.6616688370704651, "sampling/sampling_logp_difference/max": 0.41299009323120117, "sampling/sampling_logp_difference/mean": 0.01314954087138176, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3816605806350708, "epoch": 0.7879901960784313, "frac_reward_zero_std": 0.75, "grad_norm": 1.0318468385581494, "kl": 0.021000273525714874, "learning_rate": 9.225071563045006e-07, "loss": -0.0395, "num_tokens": 20414258.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5868381261825562, "sampling/importance_sampling_ratio/mean": 0.9999417066574097, "sampling/importance_sampling_ratio/min": 0.620795726776123, "sampling/sampling_logp_difference/max": 0.47675323486328125, "sampling/sampling_logp_difference/mean": 0.015572399832308292, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 172.328125, "completions/mean_terminated_length": 172.328125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3673475384712219, "epoch": 0.7892156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.02231174943566149, "kl": 0.026067199185490608, "learning_rate": 9.221257872753144e-07, "loss": 0.0003, "num_tokens": 20442167.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4737712144851685, "sampling/importance_sampling_ratio/mean": 0.9999111890792847, "sampling/importance_sampling_ratio/min": 0.6994224190711975, "sampling/sampling_logp_difference/max": 0.38782453536987305, "sampling/sampling_logp_difference/mean": 0.015629781410098076, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 142.546875, "completions/mean_terminated_length": 142.546875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3521348536014557, "epoch": 0.7904411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 0.9623338309282699, "kl": 0.03117131069302559, "learning_rate": 9.217435613575226e-07, "loss": 0.0133, "num_tokens": 20466522.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6464656591415405, "sampling/importance_sampling_ratio/mean": 0.9999923706054688, "sampling/importance_sampling_ratio/min": 0.6097986102104187, "sampling/sampling_logp_difference/max": 0.49863100051879883, "sampling/sampling_logp_difference/mean": 0.01695406809449196, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3095352053642273, "epoch": 0.7916666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.02055465809870928, "kl": 0.027063611894845963, "learning_rate": 9.213604793270196e-07, "loss": 0.0003, "num_tokens": 20491782.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.550699234008789, "sampling/importance_sampling_ratio/mean": 0.9997790455818176, "sampling/importance_sampling_ratio/min": 0.6032088994979858, "sampling/sampling_logp_difference/max": 0.5054917335510254, "sampling/sampling_logp_difference/mean": 0.014584427699446678, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 204.4375, "completions/mean_terminated_length": 204.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3655603528022766, "epoch": 0.7928921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9800069770800625, "kl": 0.02848714590072632, "learning_rate": 9.209765419614373e-07, "loss": 0.0286, "num_tokens": 20519778.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5460152626037598, "sampling/importance_sampling_ratio/mean": 0.999433159828186, "sampling/importance_sampling_ratio/min": 0.6922650933265686, "sampling/sampling_logp_difference/max": 0.4356808662414551, "sampling/sampling_logp_difference/mean": 0.015107221901416779, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 190.421875, "completions/mean_terminated_length": 190.421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3832641541957855, "epoch": 0.7941176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01899884698364976, "kl": 0.02495635487139225, "learning_rate": 9.205917500401447e-07, "loss": 0.0002, "num_tokens": 20550573.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3351675271987915, "sampling/importance_sampling_ratio/mean": 0.9992302060127258, "sampling/importance_sampling_ratio/min": 0.5054485201835632, "sampling/sampling_logp_difference/max": 0.6823091506958008, "sampling/sampling_logp_difference/mean": 0.014901909045875072, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.39816907048225403, "epoch": 0.7953431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 0.928661629774423, "kl": 0.02907564491033554, "learning_rate": 9.202061043442447e-07, "loss": 0.006, "num_tokens": 20577269.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5745267868041992, "sampling/importance_sampling_ratio/mean": 1.0006103515625, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.45395469665527344, "sampling/sampling_logp_difference/mean": 0.01689119264483452, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 195.828125, "completions/mean_terminated_length": 195.828125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.43053948879241943, "epoch": 0.7965686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.02537971558990522, "kl": 0.02950497530400753, "learning_rate": 9.198196056565738e-07, "loss": 0.0003, "num_tokens": 20608714.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.575938105583191, "sampling/importance_sampling_ratio/mean": 0.9997596740722656, "sampling/importance_sampling_ratio/min": 0.6518038511276245, "sampling/sampling_logp_difference/max": 0.4548506736755371, "sampling/sampling_logp_difference/mean": 0.017002377659082413, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 224.90625, "completions/mean_terminated_length": 224.90625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3836837410926819, "epoch": 0.7977941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.8304670957527277, "kl": 0.0255807526409626, "learning_rate": 9.194322547616997e-07, "loss": 0.0046, "num_tokens": 20641076.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4579864740371704, "sampling/importance_sampling_ratio/mean": 0.999410092830658, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.014957036823034286, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 218.265625, "completions/mean_terminated_length": 218.265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.42147013545036316, "epoch": 0.7990196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.0434532236433227, "kl": 0.034579478204250336, "learning_rate": 9.190440524459202e-07, "loss": 0.0209, "num_tokens": 20675813.0, "reward": 0.625, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.9799221754074097, "sampling/importance_sampling_ratio/mean": 1.0001271963119507, "sampling/importance_sampling_ratio/min": 0.5352949500083923, "sampling/sampling_logp_difference/max": 0.6830575466156006, "sampling/sampling_logp_difference/mean": 0.015538867563009262, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 242.40625, "completions/mean_terminated_length": 242.40625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3450837731361389, "epoch": 0.8002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03538963112994367, "kl": 0.02714606374502182, "learning_rate": 9.186549994972616e-07, "loss": 0.0003, "num_tokens": 20710559.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.507158875465393, "sampling/importance_sampling_ratio/mean": 0.9997929334640503, "sampling/importance_sampling_ratio/min": 0.49667325615882874, "sampling/sampling_logp_difference/max": 0.6998229026794434, "sampling/sampling_logp_difference/mean": 0.013135567307472229, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 210.640625, "completions/mean_terminated_length": 210.640625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.41749826073646545, "epoch": 0.8014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.1118625188874587, "kl": 0.0379713773727417, "learning_rate": 9.182650967054766e-07, "loss": 0.0278, "num_tokens": 20743672.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6087944507598877, "sampling/importance_sampling_ratio/mean": 0.9991387128829956, "sampling/importance_sampling_ratio/min": 0.633405327796936, "sampling/sampling_logp_difference/max": 0.47548508644104004, "sampling/sampling_logp_difference/mean": 0.015086393803358078, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 228.484375, "completions/mean_terminated_length": 228.484375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4391304850578308, "epoch": 0.8026960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.6970682353865376, "kl": 0.034562766551971436, "learning_rate": 9.178743448620431e-07, "loss": -0.0008, "num_tokens": 20775783.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6267740726470947, "sampling/importance_sampling_ratio/mean": 1.0002880096435547, "sampling/importance_sampling_ratio/min": 0.6578302383422852, "sampling/sampling_logp_difference/max": 0.4865989685058594, "sampling/sampling_logp_difference/mean": 0.016575045883655548, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 223.140625, "completions/mean_terminated_length": 223.140625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3668602705001831, "epoch": 0.803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7347430934219253, "kl": 0.025888238102197647, "learning_rate": 9.174827447601627e-07, "loss": -0.0145, "num_tokens": 20806064.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.620834231376648, "sampling/importance_sampling_ratio/mean": 0.9997379183769226, "sampling/importance_sampling_ratio/min": 0.6337375044822693, "sampling/sampling_logp_difference/max": 0.4829409122467041, "sampling/sampling_logp_difference/mean": 0.013868054375052452, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.37275058031082153, "epoch": 0.8051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1062569662147859, "kl": 0.02562744729220867, "learning_rate": 9.170902971947588e-07, "loss": 0.0227, "num_tokens": 20841632.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6554718017578125, "sampling/importance_sampling_ratio/mean": 1.0002436637878418, "sampling/importance_sampling_ratio/min": 0.6547819972038269, "sampling/sampling_logp_difference/max": 0.5040860176086426, "sampling/sampling_logp_difference/mean": 0.01474856398999691, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 178.78125, "completions/mean_terminated_length": 178.78125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30432000756263733, "epoch": 0.8063725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.02551123762861861, "kl": 0.02973772957921028, "learning_rate": 9.166970029624749e-07, "loss": 0.0003, "num_tokens": 20867026.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.465565800666809, "sampling/importance_sampling_ratio/mean": 0.9999655485153198, "sampling/importance_sampling_ratio/min": 0.7103471755981445, "sampling/sampling_logp_difference/max": 0.3822413682937622, "sampling/sampling_logp_difference/mean": 0.01424277201294899, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 192.328125, "completions/mean_terminated_length": 192.328125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.30770403146743774, "epoch": 0.8075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02144686647787804, "kl": 0.026862647384405136, "learning_rate": 9.163028628616738e-07, "loss": 0.0003, "num_tokens": 20897511.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.420772910118103, "sampling/importance_sampling_ratio/mean": 0.9997912645339966, "sampling/importance_sampling_ratio/min": 0.6944699287414551, "sampling/sampling_logp_difference/max": 0.3646063804626465, "sampling/sampling_logp_difference/mean": 0.012521122582256794, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 233.71875, "completions/mean_terminated_length": 233.71875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4509712755680084, "epoch": 0.8088235294117647, "frac_reward_zero_std": 0.5, "grad_norm": 1.276355707768248, "kl": 0.0320962592959404, "learning_rate": 9.159078776924345e-07, "loss": -0.0953, "num_tokens": 20930277.0, "reward": -0.28125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.624848484992981, "sampling/importance_sampling_ratio/mean": 1.0003355741500854, "sampling/importance_sampling_ratio/min": 0.612565815448761, "sampling/sampling_logp_difference/max": 0.4900989532470703, "sampling/sampling_logp_difference/mean": 0.01768365502357483, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 246.140625, "completions/mean_terminated_length": 246.140625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.31037211418151855, "epoch": 0.8100490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.665667870226732, "kl": 0.02321113646030426, "learning_rate": 9.155120482565519e-07, "loss": 0.0108, "num_tokens": 20963422.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3408809900283813, "sampling/importance_sampling_ratio/mean": 1.000077247619629, "sampling/importance_sampling_ratio/min": 0.7097097039222717, "sampling/sampling_logp_difference/max": 0.3428993225097656, "sampling/sampling_logp_difference/mean": 0.011591589078307152, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 217.015625, "completions/mean_terminated_length": 217.015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3376811742782593, "epoch": 0.8112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.027645075247131553, "kl": 0.03416694700717926, "learning_rate": 9.15115375357535e-07, "loss": 0.0003, "num_tokens": 20992127.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6521793603897095, "sampling/importance_sampling_ratio/mean": 0.9999262690544128, "sampling/importance_sampling_ratio/min": 0.6254516839981079, "sampling/sampling_logp_difference/max": 0.5020952224731445, "sampling/sampling_logp_difference/mean": 0.014627622440457344, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 229.90625, "completions/mean_terminated_length": 229.90625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3859865069389343, "epoch": 0.8125, "frac_reward_zero_std": 1.0, "grad_norm": 0.034558263862194155, "kl": 0.04741872847080231, "learning_rate": 9.147178598006044e-07, "loss": 0.0004, "num_tokens": 21022841.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003008842468262, "sampling/importance_sampling_ratio/min": 0.4793369174003601, "sampling/sampling_logp_difference/max": 0.7353515625, "sampling/sampling_logp_difference/mean": 0.014258160255849361, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 211.359375, "completions/mean_terminated_length": 211.359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.42058265209198, "epoch": 0.8137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.7761319641351087, "kl": 0.042282477021217346, "learning_rate": 9.143195023926917e-07, "loss": -0.0085, "num_tokens": 21050736.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.327823281288147, "sampling/importance_sampling_ratio/mean": 0.9997660517692566, "sampling/importance_sampling_ratio/min": 0.49541500210762024, "sampling/sampling_logp_difference/max": 0.7023594379425049, "sampling/sampling_logp_difference/mean": 0.016381915658712387, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.4263555407524109, "epoch": 0.8149509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.68782348424507, "kl": 0.02612464874982834, "learning_rate": 9.139203039424368e-07, "loss": 0.009, "num_tokens": 21087472.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.374288558959961, "sampling/importance_sampling_ratio/mean": 1.0000532865524292, "sampling/importance_sampling_ratio/min": 0.5362951159477234, "sampling/sampling_logp_difference/max": 0.6230707168579102, "sampling/sampling_logp_difference/mean": 0.01435888186097145, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 210.984375, "completions/mean_terminated_length": 210.984375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.45306310057640076, "epoch": 0.8161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7615476265988077, "kl": 0.04164658114314079, "learning_rate": 9.135202652601876e-07, "loss": -0.0096, "num_tokens": 21116751.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6147634983062744, "sampling/importance_sampling_ratio/mean": 0.9993770122528076, "sampling/importance_sampling_ratio/min": 0.6319296360015869, "sampling/sampling_logp_difference/max": 0.4791884422302246, "sampling/sampling_logp_difference/mean": 0.017668476328253746, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 269.671875, "completions/mean_terminated_length": 269.671875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3904714584350586, "epoch": 0.8174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7452772038267703, "kl": 0.028026167303323746, "learning_rate": 9.131193871579974e-07, "loss": -0.0206, "num_tokens": 21161850.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9828075170516968, "sampling/importance_sampling_ratio/mean": 1.0001330375671387, "sampling/importance_sampling_ratio/min": 0.55345618724823, "sampling/sampling_logp_difference/max": 0.6845138072967529, "sampling/sampling_logp_difference/mean": 0.015064573846757412, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.40476080775260925, "epoch": 0.8186274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.032451223748506355, "kl": 0.04005538672208786, "learning_rate": 9.127176704496231e-07, "loss": 0.0004, "num_tokens": 21196226.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5751492977142334, "sampling/importance_sampling_ratio/mean": 1.0002100467681885, "sampling/importance_sampling_ratio/min": 0.70793217420578, "sampling/sampling_logp_difference/max": 0.45434999465942383, "sampling/sampling_logp_difference/mean": 0.016093598678708076, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 242.609375, "completions/mean_terminated_length": 242.609375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.41240131855010986, "epoch": 0.8198529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0744136939825115, "kl": 0.0493934229016304, "learning_rate": 9.123151159505241e-07, "loss": 0.0095, "num_tokens": 21225465.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5759365558624268, "sampling/importance_sampling_ratio/mean": 0.9999208450317383, "sampling/importance_sampling_ratio/min": 0.7214313745498657, "sampling/sampling_logp_difference/max": 0.4548497200012207, "sampling/sampling_logp_difference/mean": 0.015428552404046059, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 215.765625, "completions/mean_terminated_length": 215.765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3737298250198364, "epoch": 0.821078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.032785380224256894, "kl": 0.0349334180355072, "learning_rate": 9.119117244778607e-07, "loss": 0.0004, "num_tokens": 21259098.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 1.0001604557037354, "sampling/importance_sampling_ratio/min": 0.5914656519889832, "sampling/sampling_logp_difference/max": 0.5251517295837402, "sampling/sampling_logp_difference/mean": 0.014463303610682487, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 208.890625, "completions/mean_terminated_length": 208.890625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3668181896209717, "epoch": 0.8223039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.030875322481958026, "kl": 0.03284909576177597, "learning_rate": 9.115074968504921e-07, "loss": 0.0003, "num_tokens": 21294099.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4577935934066772, "sampling/importance_sampling_ratio/mean": 1.0002777576446533, "sampling/importance_sampling_ratio/min": 0.6196385025978088, "sampling/sampling_logp_difference/max": 0.4786190986633301, "sampling/sampling_logp_difference/mean": 0.014896114356815815, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 208.765625, "completions/mean_terminated_length": 208.765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.33424144983291626, "epoch": 0.8235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.024422238394924658, "kl": 0.025998327881097794, "learning_rate": 9.111024338889746e-07, "loss": 0.0003, "num_tokens": 21321604.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6071261167526245, "sampling/importance_sampling_ratio/mean": 0.9993447065353394, "sampling/importance_sampling_ratio/min": 0.6622374653816223, "sampling/sampling_logp_difference/max": 0.47444748878479004, "sampling/sampling_logp_difference/mean": 0.014403936453163624, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 189.578125, "completions/mean_terminated_length": 189.578125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3311287760734558, "epoch": 0.8247549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02488256089413651, "kl": 0.027755137532949448, "learning_rate": 9.106965364155605e-07, "loss": 0.0003, "num_tokens": 21353049.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6572978496551514, "sampling/importance_sampling_ratio/mean": 0.9994744062423706, "sampling/importance_sampling_ratio/min": 0.6177636384963989, "sampling/sampling_logp_difference/max": 0.5051884651184082, "sampling/sampling_logp_difference/mean": 0.014555609785020351, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 190.515625, "completions/mean_terminated_length": 190.515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.30811795592308044, "epoch": 0.8259803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.03174957866503794, "kl": 0.03279242664575577, "learning_rate": 9.102898052541957e-07, "loss": 0.0003, "num_tokens": 21385930.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4735252857208252, "sampling/importance_sampling_ratio/mean": 1.0001640319824219, "sampling/importance_sampling_ratio/min": 0.5007209181785583, "sampling/sampling_logp_difference/max": 0.6917064189910889, "sampling/sampling_logp_difference/mean": 0.014363166876137257, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 200.921875, "completions/mean_terminated_length": 200.921875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.25038421154022217, "epoch": 0.8272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02248365752820782, "kl": 0.025730226188898087, "learning_rate": 9.09882241230519e-07, "loss": 0.0002, "num_tokens": 21414149.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6639342308044434, "sampling/importance_sampling_ratio/mean": 0.9999651908874512, "sampling/importance_sampling_ratio/min": 0.6247349381446838, "sampling/sampling_logp_difference/max": 0.5091848373413086, "sampling/sampling_logp_difference/mean": 0.012042131274938583, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 238.90625, "completions/mean_terminated_length": 238.90625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.31127092242240906, "epoch": 0.8284313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.024899186794190512, "kl": 0.024371564388275146, "learning_rate": 9.094738451718593e-07, "loss": 0.0002, "num_tokens": 21445503.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5279085636138916, "sampling/importance_sampling_ratio/mean": 1.0001455545425415, "sampling/importance_sampling_ratio/min": 0.6630603075027466, "sampling/sampling_logp_difference/max": 0.42389988899230957, "sampling/sampling_logp_difference/mean": 0.01330435648560524, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 190.328125, "completions/mean_terminated_length": 190.328125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.31943953037261963, "epoch": 0.8296568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0860904211449993, "kl": 0.04268050193786621, "learning_rate": 9.09064617907235e-07, "loss": 0.0004, "num_tokens": 21471812.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6025196313858032, "sampling/importance_sampling_ratio/mean": 0.9998210072517395, "sampling/importance_sampling_ratio/min": 0.6037432551383972, "sampling/sampling_logp_difference/max": 0.5046062469482422, "sampling/sampling_logp_difference/mean": 0.01423419825732708, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 186.328125, "completions/mean_terminated_length": 186.328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2529739737510681, "epoch": 0.8308823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0462886497662702, "kl": 0.03442566841840744, "learning_rate": 9.086545602673513e-07, "loss": 0.0003, "num_tokens": 21498521.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5877935886383057, "sampling/importance_sampling_ratio/mean": 0.999727725982666, "sampling/importance_sampling_ratio/min": 0.6185637712478638, "sampling/sampling_logp_difference/max": 0.48035502433776855, "sampling/sampling_logp_difference/mean": 0.012091949582099915, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 246.328125, "completions/mean_terminated_length": 246.328125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3578718602657318, "epoch": 0.8321078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.02742302173376564, "kl": 0.03098582848906517, "learning_rate": 9.082436730845993e-07, "loss": 0.0003, "num_tokens": 21532062.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9995269775390625, "sampling/importance_sampling_ratio/min": 0.5791468024253845, "sampling/sampling_logp_difference/max": 0.5461993217468262, "sampling/sampling_logp_difference/mean": 0.01504305936396122, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 222.203125, "completions/mean_terminated_length": 222.203125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.329847127199173, "epoch": 0.8333333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.022430099774986383, "kl": 0.02566959336400032, "learning_rate": 9.07831957193054e-07, "loss": 0.0002, "num_tokens": 21567995.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.547278642654419, "sampling/importance_sampling_ratio/mean": 0.9995587468147278, "sampling/importance_sampling_ratio/min": 0.6056252121925354, "sampling/sampling_logp_difference/max": 0.5014939308166504, "sampling/sampling_logp_difference/mean": 0.013780951499938965, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 184.859375, "completions/mean_terminated_length": 184.859375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.36886081099510193, "epoch": 0.8345588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.030786434624787917, "kl": 0.02804858796298504, "learning_rate": 9.074194134284725e-07, "loss": 0.0003, "num_tokens": 21598322.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4174898862838745, "sampling/importance_sampling_ratio/mean": 0.9999595880508423, "sampling/importance_sampling_ratio/min": 0.6053468585014343, "sampling/sampling_logp_difference/max": 0.5019536018371582, "sampling/sampling_logp_difference/mean": 0.016476290300488472, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 222.90625, "completions/mean_terminated_length": 222.90625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.40505316853523254, "epoch": 0.8357843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9681077769908948, "kl": 0.03483361750841141, "learning_rate": 9.070060426282924e-07, "loss": 0.0068, "num_tokens": 21631676.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.464301347732544, "sampling/importance_sampling_ratio/mean": 0.9999438524246216, "sampling/importance_sampling_ratio/min": 0.6095887422561646, "sampling/sampling_logp_difference/max": 0.49497079849243164, "sampling/sampling_logp_difference/mean": 0.0161592997610569, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 218.234375, "completions/mean_terminated_length": 218.234375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3744795322418213, "epoch": 0.8370098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.027247538606273457, "kl": 0.02873339131474495, "learning_rate": 9.065918456316303e-07, "loss": 0.0003, "num_tokens": 21660699.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5562453269958496, "sampling/importance_sampling_ratio/mean": 0.9999843835830688, "sampling/importance_sampling_ratio/min": 0.6183509826660156, "sampling/sampling_logp_difference/max": 0.4806990623474121, "sampling/sampling_logp_difference/mean": 0.015885494649410248, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 225.359375, "completions/mean_terminated_length": 225.359375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.31745287775993347, "epoch": 0.8382352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01958112477160249, "kl": 0.024302463978528976, "learning_rate": 9.061768232792802e-07, "loss": 0.0002, "num_tokens": 21700386.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.608885407447815, "sampling/importance_sampling_ratio/mean": 0.9997677803039551, "sampling/importance_sampling_ratio/min": 0.49139276146888733, "sampling/sampling_logp_difference/max": 0.7105115652084351, "sampling/sampling_logp_difference/mean": 0.013085578568279743, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3961857855319977, "epoch": 0.8394607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.8216536330980322, "kl": 0.036479175090789795, "learning_rate": 9.057609764137109e-07, "loss": 0.0295, "num_tokens": 21733874.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5302457809448242, "sampling/importance_sampling_ratio/mean": 0.9998958110809326, "sampling/importance_sampling_ratio/min": 0.6393280625343323, "sampling/sampling_logp_difference/max": 0.44733762741088867, "sampling/sampling_logp_difference/mean": 0.015596205368638039, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 238.765625, "completions/mean_terminated_length": 238.765625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4261338710784912, "epoch": 0.8406862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.8447611894538987, "kl": 0.03494609519839287, "learning_rate": 9.053443058790651e-07, "loss": 0.0277, "num_tokens": 21768451.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3671236038208008, "sampling/importance_sampling_ratio/mean": 0.9995797872543335, "sampling/importance_sampling_ratio/min": 0.6059609055519104, "sampling/sampling_logp_difference/max": 0.5009398460388184, "sampling/sampling_logp_difference/mean": 0.016068704426288605, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4047449827194214, "epoch": 0.8419117647058824, "frac_reward_zero_std": 0.75, "grad_norm": 0.8108837008270493, "kl": 0.03250086307525635, "learning_rate": 9.049268125211575e-07, "loss": 0.003, "num_tokens": 21799091.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5083317756652832, "sampling/importance_sampling_ratio/mean": 0.9995611906051636, "sampling/importance_sampling_ratio/min": 0.6117785573005676, "sampling/sampling_logp_difference/max": 0.49138498306274414, "sampling/sampling_logp_difference/mean": 0.015563691966235638, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 162.953125, "completions/mean_terminated_length": 162.953125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2634103298187256, "epoch": 0.8431372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.03276185420745991, "kl": 0.034213632345199585, "learning_rate": 9.045084971874737e-07, "loss": 0.0003, "num_tokens": 21826848.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.654000997543335, "sampling/importance_sampling_ratio/mean": 1.000110149383545, "sampling/importance_sampling_ratio/min": 0.5676656365394592, "sampling/sampling_logp_difference/max": 0.5662226676940918, "sampling/sampling_logp_difference/mean": 0.013691332191228867, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 181.28125, "completions/mean_terminated_length": 181.28125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3478059768676758, "epoch": 0.8443627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.029273529644389087, "kl": 0.03276558965444565, "learning_rate": 9.040893607271668e-07, "loss": 0.0003, "num_tokens": 21863346.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6288442611694336, "sampling/importance_sampling_ratio/mean": 1.0005204677581787, "sampling/importance_sampling_ratio/min": 0.6104415655136108, "sampling/sampling_logp_difference/max": 0.4935727119445801, "sampling/sampling_logp_difference/mean": 0.015371044166386127, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 197.90625, "completions/mean_terminated_length": 197.90625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3709610402584076, "epoch": 0.8455882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.023464229306227536, "kl": 0.030765000730752945, "learning_rate": 9.036694039910576e-07, "loss": 0.0003, "num_tokens": 21891996.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.613331913948059, "sampling/importance_sampling_ratio/mean": 0.9996424913406372, "sampling/importance_sampling_ratio/min": 0.6932787895202637, "sampling/sampling_logp_difference/max": 0.4783015251159668, "sampling/sampling_logp_difference/mean": 0.01533258706331253, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 196.171875, "completions/mean_terminated_length": 196.171875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.38342106342315674, "epoch": 0.8468137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.03526812413441555, "kl": 0.03387913852930069, "learning_rate": 9.032486278316313e-07, "loss": 0.0003, "num_tokens": 21922263.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.413758635520935, "sampling/importance_sampling_ratio/mean": 0.9994491934776306, "sampling/importance_sampling_ratio/min": 0.6389612555503845, "sampling/sampling_logp_difference/max": 0.44791150093078613, "sampling/sampling_logp_difference/mean": 0.015761565417051315, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 192.84375, "completions/mean_terminated_length": 192.84375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3467373251914978, "epoch": 0.8480392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.03485957554369687, "kl": 0.03310055658221245, "learning_rate": 9.028270331030372e-07, "loss": 0.0003, "num_tokens": 21952701.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9998960494995117, "sampling/importance_sampling_ratio/min": 0.6819144487380981, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.014420264400541782, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 167.390625, "completions/mean_terminated_length": 167.390625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3890190124511719, "epoch": 0.8492647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.041594942453069, "kl": 0.03580023720860481, "learning_rate": 9.024046206610857e-07, "loss": 0.0004, "num_tokens": 21982326.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4016826152801514, "sampling/importance_sampling_ratio/mean": 0.9994091987609863, "sampling/importance_sampling_ratio/min": 0.650191068649292, "sampling/sampling_logp_difference/max": 0.43048906326293945, "sampling/sampling_logp_difference/mean": 0.016755372285842896, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 191.515625, "completions/mean_terminated_length": 191.515625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3270426392555237, "epoch": 0.8504901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.022579528594023877, "kl": 0.03228536248207092, "learning_rate": 9.019813913632475e-07, "loss": 0.0003, "num_tokens": 22012471.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4279792308807373, "sampling/importance_sampling_ratio/mean": 0.9996547698974609, "sampling/importance_sampling_ratio/min": 0.7065871953964233, "sampling/sampling_logp_difference/max": 0.3562602996826172, "sampling/sampling_logp_difference/mean": 0.015847191214561462, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 142.71875, "completions/mean_terminated_length": 142.71875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.40136900544166565, "epoch": 0.8517156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.03651747452103777, "kl": 0.044040828943252563, "learning_rate": 9.015573460686509e-07, "loss": 0.0004, "num_tokens": 22038149.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7007126808166504, "sampling/importance_sampling_ratio/mean": 1.0002636909484863, "sampling/importance_sampling_ratio/min": 0.5775176286697388, "sampling/sampling_logp_difference/max": 0.5490162372589111, "sampling/sampling_logp_difference/mean": 0.01701734960079193, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 151.203125, "completions/mean_terminated_length": 151.203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.39290058612823486, "epoch": 0.8529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.04166512395127129, "kl": 0.04310157150030136, "learning_rate": 9.011324856380813e-07, "loss": 0.0004, "num_tokens": 22064770.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6670409440994263, "sampling/importance_sampling_ratio/mean": 0.9995830059051514, "sampling/importance_sampling_ratio/min": 0.6389588117599487, "sampling/sampling_logp_difference/max": 0.5110502243041992, "sampling/sampling_logp_difference/mean": 0.01720668561756611, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 209.71875, "completions/mean_terminated_length": 209.71875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4242228865623474, "epoch": 0.8541666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.021517881330219764, "kl": 0.02575260028243065, "learning_rate": 9.007068109339783e-07, "loss": 0.0003, "num_tokens": 22097184.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9997817277908325, "sampling/importance_sampling_ratio/min": 0.6130290627479553, "sampling/sampling_logp_difference/max": 0.48934292793273926, "sampling/sampling_logp_difference/mean": 0.015529869124293327, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 172.390625, "completions/mean_terminated_length": 172.390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3806571960449219, "epoch": 0.8553921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.049945500053586875, "kl": 0.03458796441555023, "learning_rate": 9.002803228204348e-07, "loss": 0.0003, "num_tokens": 22127657.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.649056315422058, "sampling/importance_sampling_ratio/mean": 0.9996928572654724, "sampling/importance_sampling_ratio/min": 0.6011093854904175, "sampling/sampling_logp_difference/max": 0.5089783668518066, "sampling/sampling_logp_difference/mean": 0.015567264519631863, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 192.421875, "completions/mean_terminated_length": 192.421875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4347988963127136, "epoch": 0.8566176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.030727108984442423, "kl": 0.0344550758600235, "learning_rate": 8.998530221631941e-07, "loss": 0.0003, "num_tokens": 22160020.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3363691568374634, "sampling/importance_sampling_ratio/mean": 0.9999052286148071, "sampling/importance_sampling_ratio/min": 0.670637309551239, "sampling/sampling_logp_difference/max": 0.39952683448791504, "sampling/sampling_logp_difference/mean": 0.0158841609954834, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 219.109375, "completions/mean_terminated_length": 219.109375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4007618725299835, "epoch": 0.8578431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.01855108143989355, "kl": 0.023340201005339622, "learning_rate": 8.994249098296502e-07, "loss": 0.0002, "num_tokens": 22191739.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4741108417510986, "sampling/importance_sampling_ratio/mean": 1.0001301765441895, "sampling/importance_sampling_ratio/min": 0.6716850996017456, "sampling/sampling_logp_difference/max": 0.397965669631958, "sampling/sampling_logp_difference/mean": 0.014817805960774422, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 163.5625, "completions/mean_terminated_length": 163.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.43091028928756714, "epoch": 0.8590686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.03130822030377899, "kl": 0.036232396960258484, "learning_rate": 8.989959866888437e-07, "loss": 0.0003, "num_tokens": 22221871.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.785539984703064, "sampling/importance_sampling_ratio/mean": 1.0007494688034058, "sampling/importance_sampling_ratio/min": 0.6233208179473877, "sampling/sampling_logp_difference/max": 0.5797208547592163, "sampling/sampling_logp_difference/mean": 0.017153877764940262, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3444056510925293, "epoch": 0.8602941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.043817633280303046, "kl": 0.040040526539087296, "learning_rate": 8.985662536114612e-07, "loss": 0.0004, "num_tokens": 22251307.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5191256999969482, "sampling/importance_sampling_ratio/mean": 0.9996485710144043, "sampling/importance_sampling_ratio/min": 0.6330731511116028, "sampling/sampling_logp_difference/max": 0.4571692943572998, "sampling/sampling_logp_difference/mean": 0.01527867466211319, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 179.421875, "completions/mean_terminated_length": 179.421875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.283399760723114, "epoch": 0.8615196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.022086809477837064, "kl": 0.0240059532225132, "learning_rate": 8.981357114698338e-07, "loss": 0.0002, "num_tokens": 22284678.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6264933347702026, "sampling/importance_sampling_ratio/mean": 0.9999646544456482, "sampling/importance_sampling_ratio/min": 0.662260890007019, "sampling/sampling_logp_difference/max": 0.48642635345458984, "sampling/sampling_logp_difference/mean": 0.012372169643640518, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 186.6875, "completions/mean_terminated_length": 186.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4073829650878906, "epoch": 0.8627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.021700756893971683, "kl": 0.0328010693192482, "learning_rate": 8.977043611379349e-07, "loss": 0.0003, "num_tokens": 22313810.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.577315330505371, "sampling/importance_sampling_ratio/mean": 1.000518560409546, "sampling/importance_sampling_ratio/min": 0.6396447420120239, "sampling/sampling_logp_difference/max": 0.45572423934936523, "sampling/sampling_logp_difference/mean": 0.0156413447111845, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 201.984375, "completions/mean_terminated_length": 201.984375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.36387914419174194, "epoch": 0.8639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.021705187236821282, "kl": 0.026145074516534805, "learning_rate": 8.972722034913781e-07, "loss": 0.0003, "num_tokens": 22350369.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4937084913253784, "sampling/importance_sampling_ratio/mean": 0.9994643926620483, "sampling/importance_sampling_ratio/min": 0.7072100043296814, "sampling/sampling_logp_difference/max": 0.40126192569732666, "sampling/sampling_logp_difference/mean": 0.015600104816257954, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 170.53125, "completions/mean_terminated_length": 170.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4234261214733124, "epoch": 0.8651960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.020820011210816534, "kl": 0.029060494154691696, "learning_rate": 8.968392394074163e-07, "loss": 0.0003, "num_tokens": 22377347.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6623014211654663, "sampling/importance_sampling_ratio/mean": 1.0004706382751465, "sampling/importance_sampling_ratio/min": 0.6393036246299744, "sampling/sampling_logp_difference/max": 0.5082030296325684, "sampling/sampling_logp_difference/mean": 0.01729530841112137, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 184.671875, "completions/mean_terminated_length": 184.671875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.41309821605682373, "epoch": 0.866421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.028351012774506905, "kl": 0.03086160495877266, "learning_rate": 8.964054697649388e-07, "loss": 0.0003, "num_tokens": 22408254.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.408115267753601, "sampling/importance_sampling_ratio/mean": 1.0000227689743042, "sampling/importance_sampling_ratio/min": 0.6796722412109375, "sampling/sampling_logp_difference/max": 0.38614463806152344, "sampling/sampling_logp_difference/mean": 0.016858208924531937, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.33535701036453247, "epoch": 0.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01831578375235787, "kl": 0.02631985768675804, "learning_rate": 8.959708954444708e-07, "loss": 0.0003, "num_tokens": 22434134.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.618519902229309, "sampling/importance_sampling_ratio/mean": 1.0003108978271484, "sampling/importance_sampling_ratio/min": 0.6791917681694031, "sampling/sampling_logp_difference/max": 0.48151206970214844, "sampling/sampling_logp_difference/mean": 0.014257533475756645, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 172.34375, "completions/mean_terminated_length": 172.34375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4500412344932556, "epoch": 0.8688725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.021215947051883795, "kl": 0.027528800070285797, "learning_rate": 8.955355173281707e-07, "loss": 0.0003, "num_tokens": 22460412.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3008592128753662, "sampling/importance_sampling_ratio/mean": 1.000032901763916, "sampling/importance_sampling_ratio/min": 0.6282380819320679, "sampling/sampling_logp_difference/max": 0.46483612060546875, "sampling/sampling_logp_difference/mean": 0.019712721928954124, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 184.609375, "completions/mean_terminated_length": 184.609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3844994902610779, "epoch": 0.8700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01740447594554791, "kl": 0.02519618347287178, "learning_rate": 8.95099336299828e-07, "loss": 0.0002, "num_tokens": 22491043.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5583192110061646, "sampling/importance_sampling_ratio/mean": 1.000016450881958, "sampling/importance_sampling_ratio/min": 0.5772902369499207, "sampling/sampling_logp_difference/max": 0.5494101643562317, "sampling/sampling_logp_difference/mean": 0.015510272234678268, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 200.609375, "completions/mean_terminated_length": 200.609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.39858540892601013, "epoch": 0.8713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.040457987331105866, "kl": 0.04456058144569397, "learning_rate": 8.946623532448631e-07, "loss": 0.0004, "num_tokens": 22522330.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.495909333229065, "sampling/importance_sampling_ratio/mean": 1.000077486038208, "sampling/importance_sampling_ratio/min": 0.6124499440193176, "sampling/sampling_logp_difference/max": 0.49028801918029785, "sampling/sampling_logp_difference/mean": 0.016100822016596794, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 188.9375, "completions/mean_terminated_length": 188.9375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3326869606971741, "epoch": 0.8725490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.01630484328654411, "kl": 0.024245694279670715, "learning_rate": 8.942245690503238e-07, "loss": 0.0002, "num_tokens": 22550214.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5851231813430786, "sampling/importance_sampling_ratio/mean": 1.0000001192092896, "sampling/importance_sampling_ratio/min": 0.6805460453033447, "sampling/sampling_logp_difference/max": 0.4606621265411377, "sampling/sampling_logp_difference/mean": 0.013762826099991798, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.31201836466789246, "epoch": 0.8737745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.016822303110914344, "kl": 0.02444467693567276, "learning_rate": 8.937859846048842e-07, "loss": 0.0002, "num_tokens": 22579228.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.637949824333191, "sampling/importance_sampling_ratio/mean": 1.0003360509872437, "sampling/importance_sampling_ratio/min": 0.635092556476593, "sampling/sampling_logp_difference/max": 0.49344539642333984, "sampling/sampling_logp_difference/mean": 0.012438100762665272, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 231.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4103919267654419, "epoch": 0.875, "frac_reward_zero_std": 0.75, "grad_norm": 0.6983983579036075, "kl": 0.026166250929236412, "learning_rate": 8.933466007988429e-07, "loss": -0.0295, "num_tokens": 22610808.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.318074107170105, "sampling/importance_sampling_ratio/mean": 0.9994531273841858, "sampling/importance_sampling_ratio/min": 0.6622469425201416, "sampling/sampling_logp_difference/max": 0.41211676597595215, "sampling/sampling_logp_difference/mean": 0.014455530792474747, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 148.421875, "completions/mean_terminated_length": 148.421875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.33341044187545776, "epoch": 0.8762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.02867520429805719, "kl": 0.03824207931756973, "learning_rate": 8.929064185241212e-07, "loss": 0.0003, "num_tokens": 22632435.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5465291738510132, "sampling/importance_sampling_ratio/mean": 0.9997183084487915, "sampling/importance_sampling_ratio/min": 0.7638641595840454, "sampling/sampling_logp_difference/max": 0.43601322174072266, "sampling/sampling_logp_difference/mean": 0.014888597652316093, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 182.21875, "completions/mean_terminated_length": 182.21875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3266327381134033, "epoch": 0.8774509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.018687077576912053, "kl": 0.02686445415019989, "learning_rate": 8.924654386742611e-07, "loss": 0.0002, "num_tokens": 22660817.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6115307807922363, "sampling/importance_sampling_ratio/mean": 0.999614953994751, "sampling/importance_sampling_ratio/min": 0.682086169719696, "sampling/sampling_logp_difference/max": 0.477184534072876, "sampling/sampling_logp_difference/mean": 0.013707519508898258, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 173.8125, "completions/mean_terminated_length": 173.8125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.36124658584594727, "epoch": 0.8786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.01949862419535672, "kl": 0.03010931983590126, "learning_rate": 8.920236621444242e-07, "loss": 0.0003, "num_tokens": 22689605.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4058588743209839, "sampling/importance_sampling_ratio/mean": 0.9999063014984131, "sampling/importance_sampling_ratio/min": 0.6182796359062195, "sampling/sampling_logp_difference/max": 0.48081445693969727, "sampling/sampling_logp_difference/mean": 0.014477972872555256, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4190867245197296, "epoch": 0.8799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.022864716090772563, "kl": 0.02433510683476925, "learning_rate": 8.915810898313884e-07, "loss": 0.0002, "num_tokens": 22720873.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3492519855499268, "sampling/importance_sampling_ratio/mean": 0.9995954632759094, "sampling/importance_sampling_ratio/min": 0.695496678352356, "sampling/sampling_logp_difference/max": 0.36312901973724365, "sampling/sampling_logp_difference/mean": 0.016453402116894722, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4043603539466858, "epoch": 0.8811274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.02135247048031107, "kl": 0.024310391396284103, "learning_rate": 8.911377226335478e-07, "loss": 0.0002, "num_tokens": 22756881.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3832709789276123, "sampling/importance_sampling_ratio/mean": 1.0000874996185303, "sampling/importance_sampling_ratio/min": 0.7845126986503601, "sampling/sampling_logp_difference/max": 0.3244509696960449, "sampling/sampling_logp_difference/mean": 0.014155912213027477, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 210.96875, "completions/mean_terminated_length": 210.96875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3904402256011963, "epoch": 0.8823529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.8236928016307357, "kl": 0.031219035387039185, "learning_rate": 8.906935614509095e-07, "loss": -0.0014, "num_tokens": 22786095.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4844071865081787, "sampling/importance_sampling_ratio/mean": 1.0000147819519043, "sampling/importance_sampling_ratio/min": 0.5703468918800354, "sampling/sampling_logp_difference/max": 0.5615105628967285, "sampling/sampling_logp_difference/mean": 0.015508387237787247, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 203.671875, "completions/mean_terminated_length": 203.671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.5759321451187134, "epoch": 0.883578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.027769155470801124, "kl": 0.03423618897795677, "learning_rate": 8.902486071850926e-07, "loss": 0.0003, "num_tokens": 22822042.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5285859107971191, "sampling/importance_sampling_ratio/mean": 0.999665379524231, "sampling/importance_sampling_ratio/min": 0.7155121564865112, "sampling/sampling_logp_difference/max": 0.4243431091308594, "sampling/sampling_logp_difference/mean": 0.01961427554488182, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 186.640625, "completions/mean_terminated_length": 186.640625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4677279591560364, "epoch": 0.8848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02152396847738153, "kl": 0.027463074773550034, "learning_rate": 8.89802860739326e-07, "loss": 0.0003, "num_tokens": 22854691.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2895188331604004, "sampling/importance_sampling_ratio/mean": 1.0000085830688477, "sampling/importance_sampling_ratio/min": 0.6262801289558411, "sampling/sampling_logp_difference/max": 0.4679574966430664, "sampling/sampling_logp_difference/mean": 0.01694682240486145, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 236.34375, "completions/mean_terminated_length": 236.34375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3707200884819031, "epoch": 0.8860294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02839822055715672, "kl": 0.025810226798057556, "learning_rate": 8.89356323018447e-07, "loss": 0.0002, "num_tokens": 22889017.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5062004327774048, "sampling/importance_sampling_ratio/mean": 1.0002585649490356, "sampling/importance_sampling_ratio/min": 0.608048677444458, "sampling/sampling_logp_difference/max": 0.4975004196166992, "sampling/sampling_logp_difference/mean": 0.013876675628125668, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 171.015625, "completions/mean_terminated_length": 171.015625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.43813496828079224, "epoch": 0.8872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.9518328097802756, "kl": 0.043780501931905746, "learning_rate": 8.889089949288986e-07, "loss": 0.0065, "num_tokens": 22914074.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6135611534118652, "sampling/importance_sampling_ratio/mean": 1.0000569820404053, "sampling/importance_sampling_ratio/min": 0.7246251106262207, "sampling/sampling_logp_difference/max": 0.47844362258911133, "sampling/sampling_logp_difference/mean": 0.01806015707552433, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 190.234375, "completions/mean_terminated_length": 190.234375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2979552745819092, "epoch": 0.8884803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.021359864558259008, "kl": 0.025271622464060783, "learning_rate": 8.884608773787288e-07, "loss": 0.0002, "num_tokens": 22939689.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.632039189338684, "sampling/importance_sampling_ratio/mean": 0.9996598362922668, "sampling/importance_sampling_ratio/min": 0.6387041211128235, "sampling/sampling_logp_difference/max": 0.48983025550842285, "sampling/sampling_logp_difference/mean": 0.013333464972674847, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 201.703125, "completions/mean_terminated_length": 201.703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.40966618061065674, "epoch": 0.8897058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.019191291626506117, "kl": 0.025096643716096878, "learning_rate": 8.880119712775875e-07, "loss": 0.0002, "num_tokens": 22971030.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6651651859283447, "sampling/importance_sampling_ratio/mean": 0.9998248219490051, "sampling/importance_sampling_ratio/min": 0.6176443696022034, "sampling/sampling_logp_difference/max": 0.5099244117736816, "sampling/sampling_logp_difference/mean": 0.014878641813993454, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 238.546875, "completions/mean_terminated_length": 238.546875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3977225422859192, "epoch": 0.8909313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.018954677765834885, "kl": 0.027560057118535042, "learning_rate": 8.875622775367259e-07, "loss": 0.0003, "num_tokens": 23002217.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5309392213821411, "sampling/importance_sampling_ratio/mean": 1.0005265474319458, "sampling/importance_sampling_ratio/min": 0.6823685169219971, "sampling/sampling_logp_difference/max": 0.42588138580322266, "sampling/sampling_logp_difference/mean": 0.014687771908938885, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 245.65625, "completions/mean_terminated_length": 245.65625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4387362003326416, "epoch": 0.8921568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.01755373771492126, "kl": 0.01948336698114872, "learning_rate": 8.871117970689937e-07, "loss": 0.0002, "num_tokens": 23036963.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5965895652770996, "sampling/importance_sampling_ratio/mean": 1.0000070333480835, "sampling/importance_sampling_ratio/min": 0.6271955370903015, "sampling/sampling_logp_difference/max": 0.46786975860595703, "sampling/sampling_logp_difference/mean": 0.01525543536990881, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 187.3125, "completions/mean_terminated_length": 187.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2976965308189392, "epoch": 0.8933823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.01756662120639696, "kl": 0.022048771381378174, "learning_rate": 8.866605307888376e-07, "loss": 0.0002, "num_tokens": 23065943.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5641213655471802, "sampling/importance_sampling_ratio/mean": 1.000281572341919, "sampling/importance_sampling_ratio/min": 0.6271852254867554, "sampling/sampling_logp_difference/max": 0.46651339530944824, "sampling/sampling_logp_difference/mean": 0.012067212723195553, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 197.359375, "completions/mean_terminated_length": 197.359375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.373897910118103, "epoch": 0.8946078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.014672301147486449, "kl": 0.019550330936908722, "learning_rate": 8.862084796122997e-07, "loss": 0.0002, "num_tokens": 23097790.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6513066291809082, "sampling/importance_sampling_ratio/mean": 1.0000605583190918, "sampling/importance_sampling_ratio/min": 0.6707313060760498, "sampling/sampling_logp_difference/max": 0.5015668869018555, "sampling/sampling_logp_difference/mean": 0.015741368755698204, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 179.703125, "completions/mean_terminated_length": 179.703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.46061819791793823, "epoch": 0.8958333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.02386264189946869, "kl": 0.02802673727273941, "learning_rate": 8.857556444570153e-07, "loss": 0.0003, "num_tokens": 23127067.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.405434250831604, "sampling/importance_sampling_ratio/mean": 0.9999182224273682, "sampling/importance_sampling_ratio/min": 0.6932373642921448, "sampling/sampling_logp_difference/max": 0.3663828372955322, "sampling/sampling_logp_difference/mean": 0.01652100682258606, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 206.84375, "completions/mean_terminated_length": 206.84375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3515208959579468, "epoch": 0.8970588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.019219332441676965, "kl": 0.02097495086491108, "learning_rate": 8.853020262422109e-07, "loss": 0.0002, "num_tokens": 23154481.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002868175506592, "sampling/importance_sampling_ratio/min": 0.6202810406684875, "sampling/sampling_logp_difference/max": 0.7561249732971191, "sampling/sampling_logp_difference/mean": 0.014154670760035515, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 219.390625, "completions/mean_terminated_length": 219.390625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.301144003868103, "epoch": 0.8982843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8816825728555113, "kl": 0.02029554918408394, "learning_rate": 8.84847625888703e-07, "loss": -0.0061, "num_tokens": 23189914.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3341046571731567, "sampling/importance_sampling_ratio/mean": 1.000375509262085, "sampling/importance_sampling_ratio/min": 0.6811178922653198, "sampling/sampling_logp_difference/max": 0.3840198516845703, "sampling/sampling_logp_difference/mean": 0.012259600684046745, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3824283480644226, "epoch": 0.8995098039215687, "frac_reward_zero_std": 0.75, "grad_norm": 1.0885734340783775, "kl": 0.027387483045458794, "learning_rate": 8.843924443188953e-07, "loss": -0.0531, "num_tokens": 23221278.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5175509452819824, "sampling/importance_sampling_ratio/mean": 0.9999219179153442, "sampling/importance_sampling_ratio/min": 0.6095960140228271, "sampling/sampling_logp_difference/max": 0.49495887756347656, "sampling/sampling_logp_difference/mean": 0.014254633337259293, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.40970197319984436, "epoch": 0.9007352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02078595852834981, "kl": 0.03226561099290848, "learning_rate": 8.839364824567775e-07, "loss": 0.0003, "num_tokens": 23251690.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.9999431371688843, "sampling/importance_sampling_ratio/min": 0.6202805638313293, "sampling/sampling_logp_difference/max": 0.4775834083557129, "sampling/sampling_logp_difference/mean": 0.016174018383026123, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 260.203125, "completions/mean_terminated_length": 260.203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4016256630420685, "epoch": 0.9019607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.7728898684931877, "kl": 0.03287380188703537, "learning_rate": 8.834797412279235e-07, "loss": -0.0102, "num_tokens": 23289127.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.9951763153076172, "sampling/importance_sampling_ratio/mean": 0.9996899366378784, "sampling/importance_sampling_ratio/min": 0.42026883363723755, "sampling/sampling_logp_difference/max": 0.8668606281280518, "sampling/sampling_logp_difference/mean": 0.014560644514858723, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 150.21875, "completions/mean_terminated_length": 150.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.29902899265289307, "epoch": 0.9031862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.027955462755797637, "kl": 0.03348477557301521, "learning_rate": 8.83022221559489e-07, "loss": 0.0003, "num_tokens": 23313061.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6089363098144531, "sampling/importance_sampling_ratio/mean": 0.9994817972183228, "sampling/importance_sampling_ratio/min": 0.417529821395874, "sampling/sampling_logp_difference/max": 0.8733993172645569, "sampling/sampling_logp_difference/mean": 0.014205368235707283, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 192.84375, "completions/mean_terminated_length": 192.84375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3875751793384552, "epoch": 0.9044117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.03185755440779379, "kl": 0.026237674057483673, "learning_rate": 8.825639243802098e-07, "loss": 0.0003, "num_tokens": 23347659.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4153696298599243, "sampling/importance_sampling_ratio/mean": 0.9998494386672974, "sampling/importance_sampling_ratio/min": 0.6408570408821106, "sampling/sampling_logp_difference/max": 0.4449489116668701, "sampling/sampling_logp_difference/mean": 0.016096360981464386, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2933084964752197, "epoch": 0.9056372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.023886017411667487, "kl": 0.028503376990556717, "learning_rate": 8.821048506204005e-07, "loss": 0.0003, "num_tokens": 23375371.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5751630067825317, "sampling/importance_sampling_ratio/mean": 0.9999306201934814, "sampling/importance_sampling_ratio/min": 0.610644519329071, "sampling/sampling_logp_difference/max": 0.4932403564453125, "sampling/sampling_logp_difference/mean": 0.013522179797291756, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 257.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.42164430022239685, "epoch": 0.9068627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.7956254619530746, "kl": 0.02659602463245392, "learning_rate": 8.816450012119513e-07, "loss": -0.0001, "num_tokens": 23415395.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6588774919509888, "sampling/importance_sampling_ratio/mean": 1.0001760721206665, "sampling/importance_sampling_ratio/min": 0.5339854955673218, "sampling/sampling_logp_difference/max": 0.6273865699768066, "sampling/sampling_logp_difference/mean": 0.015276629477739334, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 188.8125, "completions/mean_terminated_length": 188.8125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3506987690925598, "epoch": 0.9080882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.026828614541766244, "kl": 0.024205820634961128, "learning_rate": 8.811843770883276e-07, "loss": 0.0002, "num_tokens": 23445351.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4376978874206543, "sampling/importance_sampling_ratio/mean": 0.9997704029083252, "sampling/importance_sampling_ratio/min": 0.705346405506134, "sampling/sampling_logp_difference/max": 0.36304306983947754, "sampling/sampling_logp_difference/mean": 0.01479363813996315, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 299.796875, "completions/mean_terminated_length": 299.796875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.35800766944885254, "epoch": 0.9093137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.633406713428934, "kl": 0.02546757273375988, "learning_rate": 8.807229791845671e-07, "loss": -0.0173, "num_tokens": 23484266.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000248670578003, "sampling/importance_sampling_ratio/min": 0.572018563747406, "sampling/sampling_logp_difference/max": 1.0540895462036133, "sampling/sampling_logp_difference/mean": 0.014206867665052414, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 251.515625, "completions/mean_terminated_length": 251.515625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.29566600918769836, "epoch": 0.9105392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7766493417323004, "kl": 0.03525124117732048, "learning_rate": 8.802608084372785e-07, "loss": 0.0325, "num_tokens": 23521643.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4382350444793701, "sampling/importance_sampling_ratio/mean": 0.9998162984848022, "sampling/importance_sampling_ratio/min": 0.6290534138679504, "sampling/sampling_logp_difference/max": 0.46353912353515625, "sampling/sampling_logp_difference/mean": 0.012025854550302029, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 208.28125, "completions/mean_terminated_length": 208.28125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.33061474561691284, "epoch": 0.9117647058823529, "frac_reward_zero_std": 1.0, "grad_norm": 0.027698715442733026, "kl": 0.02474607154726982, "learning_rate": 8.79797865784639e-07, "loss": 0.0002, "num_tokens": 23552013.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.527300477027893, "sampling/importance_sampling_ratio/mean": 0.9997114539146423, "sampling/importance_sampling_ratio/min": 0.6178549528121948, "sampling/sampling_logp_difference/max": 0.48150157928466797, "sampling/sampling_logp_difference/mean": 0.013280758634209633, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 193.859375, "completions/mean_terminated_length": 193.859375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3518263101577759, "epoch": 0.9129901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.024194303920341127, "kl": 0.025767751038074493, "learning_rate": 8.793341521663928e-07, "loss": 0.0003, "num_tokens": 23583124.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.593743085861206, "sampling/importance_sampling_ratio/mean": 0.9996703267097473, "sampling/importance_sampling_ratio/min": 0.6124204993247986, "sampling/sampling_logp_difference/max": 0.49033617973327637, "sampling/sampling_logp_difference/mean": 0.014404002577066422, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 214.96875, "completions/mean_terminated_length": 214.96875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3150392770767212, "epoch": 0.9142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.027045237899341455, "kl": 0.021665578708052635, "learning_rate": 8.788696685238494e-07, "loss": 0.0002, "num_tokens": 23615218.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5703420639038086, "sampling/importance_sampling_ratio/mean": 0.9998134970664978, "sampling/importance_sampling_ratio/min": 0.5153923630714417, "sampling/sampling_logp_difference/max": 0.6628267765045166, "sampling/sampling_logp_difference/mean": 0.01464426051825285, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 179.203125, "completions/mean_terminated_length": 179.203125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2582736611366272, "epoch": 0.9154411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.02714215186518947, "kl": 0.02816140279173851, "learning_rate": 8.784044157998809e-07, "loss": 0.0003, "num_tokens": 23641231.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5971472263336182, "sampling/importance_sampling_ratio/mean": 1.0004371404647827, "sampling/importance_sampling_ratio/min": 0.6366870403289795, "sampling/sampling_logp_difference/max": 0.4682190418243408, "sampling/sampling_logp_difference/mean": 0.01274982187896967, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 255.578125, "completions/mean_terminated_length": 255.578125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4026065468788147, "epoch": 0.9166666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.024676608713910596, "kl": 0.03214331716299057, "learning_rate": 8.779383949389208e-07, "loss": 0.0003, "num_tokens": 23676740.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5278124809265137, "sampling/importance_sampling_ratio/mean": 0.9997199177742004, "sampling/importance_sampling_ratio/min": 0.6298378109931946, "sampling/sampling_logp_difference/max": 0.4622929096221924, "sampling/sampling_logp_difference/mean": 0.015497921034693718, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 154.859375, "completions/mean_terminated_length": 154.859375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27072569727897644, "epoch": 0.9178921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0374075047980671, "kl": 0.025277458131313324, "learning_rate": 8.774716068869623e-07, "loss": 0.0002, "num_tokens": 23702971.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6152385473251343, "sampling/importance_sampling_ratio/mean": 0.9997353553771973, "sampling/importance_sampling_ratio/min": 0.6209545135498047, "sampling/sampling_logp_difference/max": 0.47948265075683594, "sampling/sampling_logp_difference/mean": 0.013536088168621063, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 264.421875, "completions/mean_terminated_length": 264.421875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4094110131263733, "epoch": 0.9191176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.013892030016520967, "kl": 0.01472385972738266, "learning_rate": 8.770040525915553e-07, "loss": 0.0001, "num_tokens": 23750198.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000056505203247, "sampling/importance_sampling_ratio/min": 0.6676850318908691, "sampling/sampling_logp_difference/max": 0.822901725769043, "sampling/sampling_logp_difference/mean": 0.014736359938979149, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 212.640625, "completions/mean_terminated_length": 212.640625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2965674102306366, "epoch": 0.9203431372549019, "frac_reward_zero_std": 0.75, "grad_norm": 0.9046636615662998, "kl": 0.02484903112053871, "learning_rate": 8.765357330018055e-07, "loss": 0.0088, "num_tokens": 23780591.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5266870260238647, "sampling/importance_sampling_ratio/mean": 1.0007375478744507, "sampling/importance_sampling_ratio/min": 0.4810182452201843, "sampling/sampling_logp_difference/max": 0.7318501472473145, "sampling/sampling_logp_difference/mean": 0.013828756287693977, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 262.796875, "completions/mean_terminated_length": 262.796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.33337122201919556, "epoch": 0.9215686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.02823034514643041, "kl": 0.021070297807455063, "learning_rate": 8.760666490683719e-07, "loss": 0.0002, "num_tokens": 23814066.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5928850173950195, "sampling/importance_sampling_ratio/mean": 1.0001671314239502, "sampling/importance_sampling_ratio/min": 0.6269896626472473, "sampling/sampling_logp_difference/max": 0.4668252468109131, "sampling/sampling_logp_difference/mean": 0.012249452993273735, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 187.96875, "completions/mean_terminated_length": 187.96875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4141361713409424, "epoch": 0.9227941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.03437685367679887, "kl": 0.03887910395860672, "learning_rate": 8.755968017434651e-07, "loss": 0.0004, "num_tokens": 23842960.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5015095472335815, "sampling/importance_sampling_ratio/mean": 1.0005022287368774, "sampling/importance_sampling_ratio/min": 0.6262646317481995, "sampling/sampling_logp_difference/max": 0.46798229217529297, "sampling/sampling_logp_difference/mean": 0.018394464626908302, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 214.953125, "completions/mean_terminated_length": 214.953125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3201451897621155, "epoch": 0.9240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.02504827027221983, "kl": 0.0242147259414196, "learning_rate": 8.751261919808457e-07, "loss": 0.0002, "num_tokens": 23877293.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9370697736740112, "sampling/importance_sampling_ratio/mean": 1.000178575515747, "sampling/importance_sampling_ratio/min": 0.4793161451816559, "sampling/sampling_logp_difference/max": 0.7353949546813965, "sampling/sampling_logp_difference/mean": 0.014398392289876938, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 232.921875, "completions/mean_terminated_length": 232.921875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2768540382385254, "epoch": 0.9252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.016273019864649428, "kl": 0.020013771951198578, "learning_rate": 8.746548207358215e-07, "loss": 0.0002, "num_tokens": 23915560.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5176751613616943, "sampling/importance_sampling_ratio/mean": 1.0000295639038086, "sampling/importance_sampling_ratio/min": 0.5838847756385803, "sampling/sampling_logp_difference/max": 0.5380516052246094, "sampling/sampling_logp_difference/mean": 0.012251168489456177, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 225.953125, "completions/mean_terminated_length": 225.953125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.40648049116134644, "epoch": 0.9264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.03658944231433625, "kl": 0.03250942379236221, "learning_rate": 8.741826889652463e-07, "loss": 0.0003, "num_tokens": 23953125.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6207877397537231, "sampling/importance_sampling_ratio/mean": 0.9998102784156799, "sampling/importance_sampling_ratio/min": 0.660457193851471, "sampling/sampling_logp_difference/max": 0.4829123020172119, "sampling/sampling_logp_difference/mean": 0.015902938321232796, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 180.171875, "completions/mean_terminated_length": 180.171875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.31027036905288696, "epoch": 0.9276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.018178486628944335, "kl": 0.02138359844684601, "learning_rate": 8.737097976275176e-07, "loss": 0.0002, "num_tokens": 23980368.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4353374242782593, "sampling/importance_sampling_ratio/mean": 1.0002868175506592, "sampling/importance_sampling_ratio/min": 0.6369235515594482, "sampling/sampling_logp_difference/max": 0.45110559463500977, "sampling/sampling_logp_difference/mean": 0.013565674424171448, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 226.859375, "completions/mean_terminated_length": 226.859375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3295559883117676, "epoch": 0.928921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.015841653806567234, "kl": 0.021238170564174652, "learning_rate": 8.73236147682575e-07, "loss": 0.0002, "num_tokens": 24020183.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7261039018630981, "sampling/importance_sampling_ratio/mean": 1.0000017881393433, "sampling/importance_sampling_ratio/min": 0.6048555374145508, "sampling/sampling_logp_difference/max": 0.5458667278289795, "sampling/sampling_logp_difference/mean": 0.01359205599874258, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3196815252304077, "epoch": 0.9301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020711053312641645, "kl": 0.02678016386926174, "learning_rate": 8.727617400918978e-07, "loss": 0.0003, "num_tokens": 24054979.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7527730464935303, "sampling/importance_sampling_ratio/mean": 1.0000622272491455, "sampling/importance_sampling_ratio/min": 0.638654351234436, "sampling/sampling_logp_difference/max": 0.5611990690231323, "sampling/sampling_logp_difference/mean": 0.013337301090359688, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 207.40625, "completions/mean_terminated_length": 207.40625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.41590484976768494, "epoch": 0.9313725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.025386860324877646, "kl": 0.023337356746196747, "learning_rate": 8.722865758185035e-07, "loss": 0.0002, "num_tokens": 24086061.0, "reward": -1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": -1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0007213354110718, "sampling/importance_sampling_ratio/min": 0.47247007489204407, "sampling/sampling_logp_difference/max": 0.824960470199585, "sampling/sampling_logp_difference/mean": 0.016363630071282387, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 228.625, "completions/mean_terminated_length": 228.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3403834402561188, "epoch": 0.9325980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7640502987624201, "kl": 0.021827127784490585, "learning_rate": 8.718106558269452e-07, "loss": 0.0158, "num_tokens": 24119653.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5976622104644775, "sampling/importance_sampling_ratio/mean": 0.9999210238456726, "sampling/importance_sampling_ratio/min": 0.608597457408905, "sampling/sampling_logp_difference/max": 0.4965982437133789, "sampling/sampling_logp_difference/mean": 0.012747423723340034, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 166.296875, "completions/mean_terminated_length": 166.296875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3273610770702362, "epoch": 0.9338235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02234249527444986, "kl": 0.023769231513142586, "learning_rate": 8.713339810833105e-07, "loss": 0.0002, "num_tokens": 24143720.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5283203125, "sampling/importance_sampling_ratio/mean": 0.99969881772995, "sampling/importance_sampling_ratio/min": 0.6343420147895813, "sampling/sampling_logp_difference/max": 0.4551670551300049, "sampling/sampling_logp_difference/mean": 0.01511424034833908, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 194.21875, "completions/mean_terminated_length": 194.21875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.34374314546585083, "epoch": 0.9350490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.03413304169673714, "kl": 0.02705732360482216, "learning_rate": 8.708565525552189e-07, "loss": 0.0003, "num_tokens": 24174630.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.545061707496643, "sampling/importance_sampling_ratio/mean": 1.0002961158752441, "sampling/importance_sampling_ratio/min": 0.6778910756111145, "sampling/sampling_logp_difference/max": 0.43506383895874023, "sampling/sampling_logp_difference/mean": 0.014042122289538383, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 225.171875, "completions/mean_terminated_length": 225.171875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3956797122955322, "epoch": 0.9362745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.9027058283169005, "kl": 0.02620890364050865, "learning_rate": 8.703783712118202e-07, "loss": -0.0312, "num_tokens": 24212449.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4614901542663574, "sampling/importance_sampling_ratio/mean": 0.9998257160186768, "sampling/importance_sampling_ratio/min": 0.6307316422462463, "sampling/sampling_logp_difference/max": 0.4608747959136963, "sampling/sampling_logp_difference/mean": 0.015541866421699524, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3058358430862427, "epoch": 0.9375, "frac_reward_zero_std": 1.0, "grad_norm": 0.020757007458023786, "kl": 0.02144656702876091, "learning_rate": 8.69899438023792e-07, "loss": 0.0002, "num_tokens": 24240967.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9106395244598389, "sampling/importance_sampling_ratio/mean": 1.000170350074768, "sampling/importance_sampling_ratio/min": 0.40289777517318726, "sampling/sampling_logp_difference/max": 0.9090723991394043, "sampling/sampling_logp_difference/mean": 0.0138449901714921, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 250.9375, "completions/mean_terminated_length": 250.9375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.271751344203949, "epoch": 0.9387254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.012337520673509015, "kl": 0.016083460301160812, "learning_rate": 8.694197539633385e-07, "loss": 0.0001, "num_tokens": 24277011.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.406278133392334, "sampling/importance_sampling_ratio/mean": 1.0002005100250244, "sampling/importance_sampling_ratio/min": 0.6132686734199524, "sampling/sampling_logp_difference/max": 0.4889521598815918, "sampling/sampling_logp_difference/mean": 0.011555547825992107, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 167.65625, "completions/mean_terminated_length": 167.65625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3901594281196594, "epoch": 0.9399509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02606277068546319, "kl": 0.02786336839199066, "learning_rate": 8.689393200041878e-07, "loss": 0.0003, "num_tokens": 24307549.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6091108322143555, "sampling/importance_sampling_ratio/mean": 1.0003968477249146, "sampling/importance_sampling_ratio/min": 0.6550332903862, "sampling/sampling_logp_difference/max": 0.47568178176879883, "sampling/sampling_logp_difference/mean": 0.015018296428024769, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 192.0, "completions/mean_terminated_length": 192.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4136354327201843, "epoch": 0.9411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.8874129279894816, "kl": 0.029054976999759674, "learning_rate": 8.684581371215904e-07, "loss": -0.0202, "num_tokens": 24343789.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3191401958465576, "sampling/importance_sampling_ratio/mean": 1.0002342462539673, "sampling/importance_sampling_ratio/min": 0.474845290184021, "sampling/sampling_logp_difference/max": 0.7447662353515625, "sampling/sampling_logp_difference/mean": 0.01565386727452278, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 194.34375, "completions/mean_terminated_length": 194.34375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.36488449573516846, "epoch": 0.9424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7283355799647103, "kl": 0.03226052224636078, "learning_rate": 8.679762062923175e-07, "loss": 0.034, "num_tokens": 24372499.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5199317932128906, "sampling/importance_sampling_ratio/mean": 1.0011816024780273, "sampling/importance_sampling_ratio/min": 0.6558409929275513, "sampling/sampling_logp_difference/max": 0.42183685302734375, "sampling/sampling_logp_difference/mean": 0.015518213622272015, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 176.09375, "completions/mean_terminated_length": 176.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.34749341011047363, "epoch": 0.9436274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.02971181417815214, "kl": 0.02474549412727356, "learning_rate": 8.674935284946576e-07, "loss": 0.0002, "num_tokens": 24396969.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.292677640914917, "sampling/importance_sampling_ratio/mean": 0.9994853138923645, "sampling/importance_sampling_ratio/min": 0.7128995060920715, "sampling/sampling_logp_difference/max": 0.33841484785079956, "sampling/sampling_logp_difference/mean": 0.015015466138720512, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 157.296875, "completions/mean_terminated_length": 157.296875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.31486156582832336, "epoch": 0.9448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.026017777114247995, "kl": 0.027308259159326553, "learning_rate": 8.670101047084162e-07, "loss": 0.0003, "num_tokens": 24422348.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9757723808288574, "sampling/importance_sampling_ratio/mean": 0.9998023509979248, "sampling/importance_sampling_ratio/min": 0.4160304367542267, "sampling/sampling_logp_difference/max": 0.8769968152046204, "sampling/sampling_logp_difference/mean": 0.014564323239028454, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.38905900716781616, "epoch": 0.946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.9039089435812095, "kl": 0.024624085053801537, "learning_rate": 8.66525935914913e-07, "loss": 0.0136, "num_tokens": 24449128.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4844878911972046, "sampling/importance_sampling_ratio/mean": 0.9995477795600891, "sampling/importance_sampling_ratio/min": 0.6306890845298767, "sampling/sampling_logp_difference/max": 0.46094226837158203, "sampling/sampling_logp_difference/mean": 0.016147365793585777, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 185.03125, "completions/mean_terminated_length": 185.03125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.340989351272583, "epoch": 0.9473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02769537574442187, "kl": 0.02422124333679676, "learning_rate": 8.660410230969804e-07, "loss": 0.0002, "num_tokens": 24477034.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4221079349517822, "sampling/importance_sampling_ratio/mean": 0.9997546672821045, "sampling/importance_sampling_ratio/min": 0.6175847053527832, "sampling/sampling_logp_difference/max": 0.48193907737731934, "sampling/sampling_logp_difference/mean": 0.01526421494781971, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 147.140625, "completions/mean_terminated_length": 147.140625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.34744322299957275, "epoch": 0.9485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02571645606738767, "kl": 0.02424619346857071, "learning_rate": 8.655553672389599e-07, "loss": 0.0002, "num_tokens": 24502595.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6170932054519653, "sampling/importance_sampling_ratio/mean": 1.0006287097930908, "sampling/importance_sampling_ratio/min": 0.6371588706970215, "sampling/sampling_logp_difference/max": 0.48063015937805176, "sampling/sampling_logp_difference/mean": 0.01588124781847, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 168.5625, "completions/mean_terminated_length": 168.5625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.4033295214176178, "epoch": 0.9497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.026614461560211557, "kl": 0.02627166360616684, "learning_rate": 8.650689693267026e-07, "loss": 0.0003, "num_tokens": 24536119.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071913003921509, "sampling/importance_sampling_ratio/mean": 0.9998084306716919, "sampling/importance_sampling_ratio/min": 0.6212287545204163, "sampling/sampling_logp_difference/max": 0.4760558605194092, "sampling/sampling_logp_difference/mean": 0.01671476662158966, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 182.03125, "completions/mean_terminated_length": 182.03125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3922005891799927, "epoch": 0.9509803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 1.1076741310683296, "kl": 0.03362289443612099, "learning_rate": 8.645818303475654e-07, "loss": -0.0317, "num_tokens": 24565225.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3488740921020508, "sampling/importance_sampling_ratio/mean": 0.9998282194137573, "sampling/importance_sampling_ratio/min": 0.6369741559028625, "sampling/sampling_logp_difference/max": 0.45102620124816895, "sampling/sampling_logp_difference/mean": 0.015504934825003147, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 137.28125, "completions/mean_terminated_length": 137.28125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.33687686920166016, "epoch": 0.9522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.03624148232675531, "kl": 0.031308457255363464, "learning_rate": 8.640939512904095e-07, "loss": 0.0003, "num_tokens": 24593835.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5006722211837769, "sampling/importance_sampling_ratio/mean": 0.9992250204086304, "sampling/importance_sampling_ratio/min": 0.7104525566101074, "sampling/sampling_logp_difference/max": 0.4059131145477295, "sampling/sampling_logp_difference/mean": 0.015481984242796898, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 161.328125, "completions/mean_terminated_length": 161.328125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.36610496044158936, "epoch": 0.9534313725490197, "frac_reward_zero_std": 1.0, "grad_norm": 0.02376816592999184, "kl": 0.02393110655248165, "learning_rate": 8.636053331455986e-07, "loss": 0.0002, "num_tokens": 24622064.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744339227676392, "sampling/importance_sampling_ratio/mean": 0.9991030097007751, "sampling/importance_sampling_ratio/min": 0.6132981777191162, "sampling/sampling_logp_difference/max": 0.4889039993286133, "sampling/sampling_logp_difference/mean": 0.016807250678539276, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 176.859375, "completions/mean_terminated_length": 176.859375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3858628273010254, "epoch": 0.9546568627450981, "frac_reward_zero_std": 1.0, "grad_norm": 0.03668443118356197, "kl": 0.0322355255484581, "learning_rate": 8.631159769049964e-07, "loss": 0.0003, "num_tokens": 24654519.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999144673347473, "sampling/importance_sampling_ratio/min": 0.5613397359848022, "sampling/sampling_logp_difference/max": 0.8548152446746826, "sampling/sampling_logp_difference/mean": 0.01737913489341736, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 190.203125, "completions/mean_terminated_length": 190.203125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.36571139097213745, "epoch": 0.9558823529411765, "frac_reward_zero_std": 0.75, "grad_norm": 0.9619406119505388, "kl": 0.03215567767620087, "learning_rate": 8.626258835619653e-07, "loss": 0.0087, "num_tokens": 24682900.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4509612321853638, "sampling/importance_sampling_ratio/mean": 0.99996018409729, "sampling/importance_sampling_ratio/min": 0.5384482741355896, "sampling/sampling_logp_difference/max": 0.6190638542175293, "sampling/sampling_logp_difference/mean": 0.015748433768749237, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 221.984375, "completions/mean_terminated_length": 221.984375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3734496831893921, "epoch": 0.9571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.023910769326798156, "kl": 0.024634193629026413, "learning_rate": 8.621350541113636e-07, "loss": 0.0002, "num_tokens": 24715219.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3035078048706055, "sampling/importance_sampling_ratio/mean": 0.9995801448822021, "sampling/importance_sampling_ratio/min": 0.6926900148391724, "sampling/sampling_logp_difference/max": 0.3671727180480957, "sampling/sampling_logp_difference/mean": 0.014857929199934006, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 179.96875, "completions/mean_terminated_length": 179.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.31734922528266907, "epoch": 0.9583333333333334, "frac_reward_zero_std": 1.0, "grad_norm": 0.022605097620998472, "kl": 0.024092108011245728, "learning_rate": 8.616434895495439e-07, "loss": 0.0002, "num_tokens": 24740017.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4312864542007446, "sampling/importance_sampling_ratio/mean": 1.0001903772354126, "sampling/importance_sampling_ratio/min": 0.6522687077522278, "sampling/sampling_logp_difference/max": 0.4272986650466919, "sampling/sampling_logp_difference/mean": 0.014620056375861168, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 197.671875, "completions/mean_terminated_length": 197.671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.368533730506897, "epoch": 0.9595588235294118, "frac_reward_zero_std": 1.0, "grad_norm": 0.03031176642955676, "kl": 0.02464437112212181, "learning_rate": 8.611511908743514e-07, "loss": 0.0002, "num_tokens": 24767324.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6221281290054321, "sampling/importance_sampling_ratio/mean": 1.0000519752502441, "sampling/importance_sampling_ratio/min": 0.6308545470237732, "sampling/sampling_logp_difference/max": 0.48373889923095703, "sampling/sampling_logp_difference/mean": 0.015080577693879604, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 187.6875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3149415850639343, "epoch": 0.9607843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.022615322522914705, "kl": 0.031320638954639435, "learning_rate": 8.606581590851208e-07, "loss": 0.0002, "num_tokens": 24794376.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4683986902236938, "sampling/importance_sampling_ratio/mean": 0.999013364315033, "sampling/importance_sampling_ratio/min": 0.6115894913673401, "sampling/sampling_logp_difference/max": 0.49169397354125977, "sampling/sampling_logp_difference/mean": 0.01463400200009346, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 203.390625, "completions/mean_terminated_length": 203.390625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4158928096294403, "epoch": 0.9620098039215687, "frac_reward_zero_std": 1.0, "grad_norm": 0.017029810644211132, "kl": 0.02237580344080925, "learning_rate": 8.601643951826758e-07, "loss": 0.0002, "num_tokens": 24827121.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4754341840744019, "sampling/importance_sampling_ratio/mean": 0.9997149705886841, "sampling/importance_sampling_ratio/min": 0.63714200258255, "sampling/sampling_logp_difference/max": 0.4507627487182617, "sampling/sampling_logp_difference/mean": 0.015549298375844955, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 169.078125, "completions/mean_terminated_length": 169.078125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3622353971004486, "epoch": 0.9632352941176471, "frac_reward_zero_std": 1.0, "grad_norm": 0.02681246787418524, "kl": 0.025942612439393997, "learning_rate": 8.596699001693255e-07, "loss": 0.0003, "num_tokens": 24854198.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6397879123687744, "sampling/importance_sampling_ratio/mean": 0.9995217323303223, "sampling/importance_sampling_ratio/min": 0.6080277562141418, "sampling/sampling_logp_difference/max": 0.49753475189208984, "sampling/sampling_logp_difference/mean": 0.014673823490738869, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 229.984375, "completions/mean_terminated_length": 229.984375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.39878255128860474, "epoch": 0.9644607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.7017346655063545, "kl": 0.03290364891290665, "learning_rate": 8.591746750488637e-07, "loss": 0.0012, "num_tokens": 24888629.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.7453142404556274, "sampling/importance_sampling_ratio/mean": 0.9994145631790161, "sampling/importance_sampling_ratio/min": 0.42222246527671814, "sampling/sampling_logp_difference/max": 0.8622229099273682, "sampling/sampling_logp_difference/mean": 0.015562876127660275, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 186.09375, "completions/mean_terminated_length": 186.09375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.36311158537864685, "epoch": 0.9656862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.025418103632889946, "kl": 0.02377057820558548, "learning_rate": 8.58678720826566e-07, "loss": 0.0003, "num_tokens": 24917659.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5889713764190674, "sampling/importance_sampling_ratio/mean": 0.9998500347137451, "sampling/importance_sampling_ratio/min": 0.6410223841667175, "sampling/sampling_logp_difference/max": 0.4630868434906006, "sampling/sampling_logp_difference/mean": 0.015514722093939781, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2629473805427551, "epoch": 0.9669117647058824, "frac_reward_zero_std": 1.0, "grad_norm": 0.016260778793600026, "kl": 0.017105624079704285, "learning_rate": 8.58182038509188e-07, "loss": 0.0002, "num_tokens": 24947947.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4748215675354004, "sampling/importance_sampling_ratio/mean": 0.9995681047439575, "sampling/importance_sampling_ratio/min": 0.346270352602005, "sampling/sampling_logp_difference/max": 1.0605354309082031, "sampling/sampling_logp_difference/mean": 0.011698717251420021, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3489529490470886, "epoch": 0.9681372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.04731401217274868, "kl": 0.032521091401576996, "learning_rate": 8.576846291049633e-07, "loss": 0.0003, "num_tokens": 24980255.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.652165174484253, "sampling/importance_sampling_ratio/mean": 1.0006591081619263, "sampling/importance_sampling_ratio/min": 0.643796443939209, "sampling/sampling_logp_difference/max": 0.5020866394042969, "sampling/sampling_logp_difference/mean": 0.01453758031129837, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 248.921875, "completions/mean_terminated_length": 248.921875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.45151424407958984, "epoch": 0.9693627450980392, "frac_reward_zero_std": 0.5, "grad_norm": 0.9283430316909879, "kl": 0.04374338686466217, "learning_rate": 8.571864936236015e-07, "loss": 0.0154, "num_tokens": 25010602.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4305888414382935, "sampling/importance_sampling_ratio/mean": 0.9994878768920898, "sampling/importance_sampling_ratio/min": 0.5685389041900635, "sampling/sampling_logp_difference/max": 0.5646854639053345, "sampling/sampling_logp_difference/mean": 0.017240433022379875, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 183.34375, "completions/mean_terminated_length": 183.34375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3219026029109955, "epoch": 0.9705882352941176, "frac_reward_zero_std": 1.0, "grad_norm": 0.023093056049612728, "kl": 0.022820979356765747, "learning_rate": 8.56687633076286e-07, "loss": 0.0002, "num_tokens": 25038688.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000104904174805, "sampling/importance_sampling_ratio/min": 0.43182510137557983, "sampling/sampling_logp_difference/max": 0.839734673500061, "sampling/sampling_logp_difference/mean": 0.01387720089405775, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 189.234375, "completions/mean_terminated_length": 189.234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.398162305355072, "epoch": 0.9718137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.02067089574625209, "kl": 0.023449556902050972, "learning_rate": 8.561880484756724e-07, "loss": 0.0002, "num_tokens": 25071135.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4607446193695068, "sampling/importance_sampling_ratio/mean": 0.9997786283493042, "sampling/importance_sampling_ratio/min": 0.6524137258529663, "sampling/sampling_logp_difference/max": 0.4270763397216797, "sampling/sampling_logp_difference/mean": 0.016811659559607506, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 182.90625, "completions/mean_terminated_length": 182.90625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3328895568847656, "epoch": 0.9730392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.02490222856414553, "kl": 0.022241603583097458, "learning_rate": 8.556877408358854e-07, "loss": 0.0002, "num_tokens": 25098921.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.656008243560791, "sampling/importance_sampling_ratio/mean": 0.9999021887779236, "sampling/importance_sampling_ratio/min": 0.5575329065322876, "sampling/sampling_logp_difference/max": 0.5842337608337402, "sampling/sampling_logp_difference/mean": 0.014693650417029858, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 204.21875, "completions/mean_terminated_length": 204.21875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4073524475097656, "epoch": 0.9742647058823529, "frac_reward_zero_std": 0.75, "grad_norm": 0.7699289479757312, "kl": 0.025464978069067, "learning_rate": 8.551867111725182e-07, "loss": -0.0142, "num_tokens": 25128439.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6670647859573364, "sampling/importance_sampling_ratio/mean": 0.9994465112686157, "sampling/importance_sampling_ratio/min": 0.6182374358177185, "sampling/sampling_logp_difference/max": 0.5110645294189453, "sampling/sampling_logp_difference/mean": 0.014799746684730053, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 209.609375, "completions/mean_terminated_length": 209.609375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.4414971172809601, "epoch": 0.9754901960784313, "frac_reward_zero_std": 1.0, "grad_norm": 0.021774394806298312, "kl": 0.02606060355901718, "learning_rate": 8.546849605026288e-07, "loss": 0.0002, "num_tokens": 25162974.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.507165551185608, "sampling/importance_sampling_ratio/mean": 0.9992042779922485, "sampling/importance_sampling_ratio/min": 0.6603471040725708, "sampling/sampling_logp_difference/max": 0.414989709854126, "sampling/sampling_logp_difference/mean": 0.017267197370529175, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.43835189938545227, "epoch": 0.9767156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.021780071068811264, "kl": 0.022342439740896225, "learning_rate": 8.541824898447397e-07, "loss": 0.0002, "num_tokens": 25198278.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8651736974716187, "sampling/importance_sampling_ratio/mean": 0.9994983077049255, "sampling/importance_sampling_ratio/min": 0.6086403727531433, "sampling/sampling_logp_difference/max": 0.6233541965484619, "sampling/sampling_logp_difference/mean": 0.01614305004477501, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 204.578125, "completions/mean_terminated_length": 204.578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.34380316734313965, "epoch": 0.9779411764705882, "frac_reward_zero_std": 0.75, "grad_norm": 0.9334916089133114, "kl": 0.021110976114869118, "learning_rate": 8.536793002188343e-07, "loss": -0.0265, "num_tokens": 25229195.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5910096168518066, "sampling/importance_sampling_ratio/mean": 0.9997698664665222, "sampling/importance_sampling_ratio/min": 0.5589753985404968, "sampling/sampling_logp_difference/max": 0.5816497802734375, "sampling/sampling_logp_difference/mean": 0.014641951769590378, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 190.953125, "completions/mean_terminated_length": 190.953125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3930814266204834, "epoch": 0.9791666666666666, "frac_reward_zero_std": 0.75, "grad_norm": 0.9708205281054662, "kl": 0.024711620062589645, "learning_rate": 8.531753926463556e-07, "loss": -0.0429, "num_tokens": 25266424.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.753625512123108, "sampling/importance_sampling_ratio/mean": 1.001275658607483, "sampling/importance_sampling_ratio/min": 0.4847486913204193, "sampling/sampling_logp_difference/max": 0.7241246700286865, "sampling/sampling_logp_difference/mean": 0.015727955847978592, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 246.328125, "completions/mean_terminated_length": 246.328125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.35948020219802856, "epoch": 0.9803921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.8200327497323995, "kl": 0.028046952560544014, "learning_rate": 8.526707681502043e-07, "loss": 0.0293, "num_tokens": 25310893.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.656794548034668, "sampling/importance_sampling_ratio/mean": 1.0002923011779785, "sampling/importance_sampling_ratio/min": 0.6141847968101501, "sampling/sampling_logp_difference/max": 0.5048847198486328, "sampling/sampling_logp_difference/mean": 0.013518155552446842, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3920966386795044, "epoch": 0.9816176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.031294687145535204, "kl": 0.025551265105605125, "learning_rate": 8.521654277547361e-07, "loss": 0.0003, "num_tokens": 25344293.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.602399230003357, "sampling/importance_sampling_ratio/mean": 0.9989451766014099, "sampling/importance_sampling_ratio/min": 0.654786229133606, "sampling/sampling_logp_difference/max": 0.47150206565856934, "sampling/sampling_logp_difference/mean": 0.016796045005321503, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 199.421875, "completions/mean_terminated_length": 199.421875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.37933820486068726, "epoch": 0.9828431372549019, "frac_reward_zero_std": 1.0, "grad_norm": 0.021350614162237914, "kl": 0.025682110339403152, "learning_rate": 8.516593724857597e-07, "loss": 0.0003, "num_tokens": 25375216.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3848679065704346, "sampling/importance_sampling_ratio/mean": 0.9996330738067627, "sampling/importance_sampling_ratio/min": 0.6501674652099609, "sampling/sampling_logp_difference/max": 0.4305253028869629, "sampling/sampling_logp_difference/mean": 0.015885349363088608, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 204.28125, "completions/mean_terminated_length": 204.28125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4139837622642517, "epoch": 0.9840686274509803, "frac_reward_zero_std": 1.0, "grad_norm": 0.036269306023235776, "kl": 0.02276715636253357, "learning_rate": 8.511526033705356e-07, "loss": 0.0002, "num_tokens": 25406194.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6009104251861572, "sampling/importance_sampling_ratio/mean": 0.9997035264968872, "sampling/importance_sampling_ratio/min": 0.5364245176315308, "sampling/sampling_logp_difference/max": 0.6228294372558594, "sampling/sampling_logp_difference/mean": 0.016889125108718872, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 210.828125, "completions/mean_terminated_length": 210.828125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.36640146374702454, "epoch": 0.9852941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0232377342843665, "kl": 0.023490116000175476, "learning_rate": 8.506451214377728e-07, "loss": 0.0002, "num_tokens": 25436359.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999101758003235, "sampling/importance_sampling_ratio/min": 0.5069254636764526, "sampling/sampling_logp_difference/max": 0.7274184226989746, "sampling/sampling_logp_difference/mean": 0.01511009968817234, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 160.4375, "completions/mean_terminated_length": 160.4375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3254278898239136, "epoch": 0.9865196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.02277267755169316, "kl": 0.025547131896018982, "learning_rate": 8.501369277176274e-07, "loss": 0.0003, "num_tokens": 25469027.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5311439037322998, "sampling/importance_sampling_ratio/mean": 0.99989914894104, "sampling/importance_sampling_ratio/min": 0.6171427965164185, "sampling/sampling_logp_difference/max": 0.4826548099517822, "sampling/sampling_logp_difference/mean": 0.014904014766216278, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 189.921875, "completions/mean_terminated_length": 189.921875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3373665511608124, "epoch": 0.9877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.030394630041370656, "kl": 0.028624843806028366, "learning_rate": 8.496280232417007e-07, "loss": 0.0003, "num_tokens": 25505374.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3983898162841797, "sampling/importance_sampling_ratio/mean": 0.9999194145202637, "sampling/importance_sampling_ratio/min": 0.6314087510108948, "sampling/sampling_logp_difference/max": 0.45980191230773926, "sampling/sampling_logp_difference/mean": 0.015270407311618328, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 199.1875, "completions/mean_terminated_length": 199.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3611469864845276, "epoch": 0.9889705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.024047604858430795, "kl": 0.02229381911456585, "learning_rate": 8.491184090430363e-07, "loss": 0.0002, "num_tokens": 25534474.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6014314889907837, "sampling/importance_sampling_ratio/mean": 0.9993778467178345, "sampling/importance_sampling_ratio/min": 0.721843421459198, "sampling/sampling_logp_difference/max": 0.470897912979126, "sampling/sampling_logp_difference/mean": 0.015921613201498985, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.36096253991127014, "epoch": 0.9901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.017139250348018616, "kl": 0.01766836643218994, "learning_rate": 8.48608086156119e-07, "loss": 0.0002, "num_tokens": 25570554.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7675029039382935, "sampling/importance_sampling_ratio/mean": 1.000484824180603, "sampling/importance_sampling_ratio/min": 0.5053761601448059, "sampling/sampling_logp_difference/max": 0.6824522018432617, "sampling/sampling_logp_difference/mean": 0.014950842596590519, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 195.21875, "completions/mean_terminated_length": 195.21875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2919182777404785, "epoch": 0.991421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.02545194783559473, "kl": 0.022783063352108, "learning_rate": 8.480970556168717e-07, "loss": 0.0002, "num_tokens": 25595112.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4753773212432861, "sampling/importance_sampling_ratio/mean": 0.9998204112052917, "sampling/importance_sampling_ratio/min": 0.6529889702796936, "sampling/sampling_logp_difference/max": 0.42619502544403076, "sampling/sampling_logp_difference/mean": 0.014107544906437397, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 269.34375, "completions/mean_terminated_length": 269.34375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.430225133895874, "epoch": 0.9926470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.7129722044749738, "kl": 0.023319419473409653, "learning_rate": 8.47585318462654e-07, "loss": 0.023, "num_tokens": 25632446.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4448902606964111, "sampling/importance_sampling_ratio/mean": 0.9993377923965454, "sampling/importance_sampling_ratio/min": 0.6666995882987976, "sampling/sampling_logp_difference/max": 0.40541577339172363, "sampling/sampling_logp_difference/mean": 0.01555697526782751, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 272.359375, "completions/mean_terminated_length": 272.359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.40380987524986267, "epoch": 0.9938725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.6766444865459755, "kl": 0.019705766811966896, "learning_rate": 8.470728757322603e-07, "loss": 0.0152, "num_tokens": 25671429.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0000799894332886, "sampling/importance_sampling_ratio/min": 0.6306539177894592, "sampling/sampling_logp_difference/max": 0.4609980583190918, "sampling/sampling_logp_difference/mean": 0.014769963920116425, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 147.265625, "completions/mean_terminated_length": 147.265625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.24114222824573517, "epoch": 0.9950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.03595364233249824, "kl": 0.02802179381251335, "learning_rate": 8.465597284659163e-07, "loss": 0.0003, "num_tokens": 25694054.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.475376844406128, "sampling/importance_sampling_ratio/mean": 0.9994142651557922, "sampling/importance_sampling_ratio/min": 0.6254650950431824, "sampling/sampling_logp_difference/max": 0.46925973892211914, "sampling/sampling_logp_difference/mean": 0.01176031120121479, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3158072233200073, "epoch": 0.9963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02249599983166481, "kl": 0.023762090131640434, "learning_rate": 8.460458777052788e-07, "loss": 0.0002, "num_tokens": 25724774.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998077750205994, "sampling/importance_sampling_ratio/min": 0.6465713977813721, "sampling/sampling_logp_difference/max": 1.22906494140625, "sampling/sampling_logp_difference/mean": 0.014211948961019516, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 244.6875, "completions/mean_terminated_length": 244.6875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.45422881841659546, "epoch": 0.9975490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.078847620590917, "kl": 0.03181261569261551, "learning_rate": 8.455313244934324e-07, "loss": 0.0243, "num_tokens": 25760546.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.6600011587142944, "sampling/importance_sampling_ratio/mean": 1.00004243850708, "sampling/importance_sampling_ratio/min": 0.7450798749923706, "sampling/sampling_logp_difference/max": 0.5068182945251465, "sampling/sampling_logp_difference/mean": 0.015185045078396797, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 246.359375, "completions/mean_terminated_length": 246.359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3757499158382416, "epoch": 0.9987745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.8332631864909773, "kl": 0.026064181700348854, "learning_rate": 8.450160698748871e-07, "loss": -0.0242, "num_tokens": 25792345.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5309606790542603, "sampling/importance_sampling_ratio/mean": 0.9993084073066711, "sampling/importance_sampling_ratio/min": 0.5552458167076111, "sampling/sampling_logp_difference/max": 0.5883443355560303, "sampling/sampling_logp_difference/mean": 0.01448429748415947, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 178.703125, "completions/mean_terminated_length": 178.703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4138263463973999, "epoch": 1.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.024616208636678072, "kl": 0.039309754967689514, "learning_rate": 8.445001148955775e-07, "loss": 0.0004, "num_tokens": 25818646.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071574449539185, "sampling/importance_sampling_ratio/mean": 1.0002918243408203, "sampling/importance_sampling_ratio/min": 0.6660245656967163, "sampling/sampling_logp_difference/max": 0.41022539138793945, "sampling/sampling_logp_difference/mean": 0.01580060087144375, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 222.328125, "completions/mean_terminated_length": 222.328125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.44479626417160034, "epoch": 1.0012254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.016823090218986002, "kl": 0.021708671003580093, "learning_rate": 8.439834606028593e-07, "loss": 0.0002, "num_tokens": 25852363.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4680591821670532, "sampling/importance_sampling_ratio/mean": 0.9998198747634888, "sampling/importance_sampling_ratio/min": 0.642697274684906, "sampling/sampling_logp_difference/max": 0.4420814514160156, "sampling/sampling_logp_difference/mean": 0.016688354313373566, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 237.9375, "completions/mean_terminated_length": 237.9375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4154745936393738, "epoch": 1.0024509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.7987049514767518, "kl": 0.03554980456829071, "learning_rate": 8.434661080455082e-07, "loss": 0.0063, "num_tokens": 25886503.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.621817946434021, "sampling/importance_sampling_ratio/mean": 1.0001156330108643, "sampling/importance_sampling_ratio/min": 0.6282684803009033, "sampling/sampling_logp_difference/max": 0.4835476875305176, "sampling/sampling_logp_difference/mean": 0.014652641490101814, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 168.46875, "completions/mean_terminated_length": 168.46875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.33490508794784546, "epoch": 1.0036764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.9270778190476674, "kl": 0.03322033956646919, "learning_rate": 8.42948058273717e-07, "loss": 0.0077, "num_tokens": 25910277.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.545121669769287, "sampling/importance_sampling_ratio/mean": 1.0001842975616455, "sampling/importance_sampling_ratio/min": 0.6543320417404175, "sampling/sampling_logp_difference/max": 0.4351027011871338, "sampling/sampling_logp_difference/mean": 0.01393546536564827, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3989688456058502, "epoch": 1.0049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.01897158266492704, "kl": 0.025341054424643517, "learning_rate": 8.424293123390938e-07, "loss": 0.0002, "num_tokens": 25940357.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6479039192199707, "sampling/importance_sampling_ratio/mean": 1.0001342296600342, "sampling/importance_sampling_ratio/min": 0.6151823997497559, "sampling/sampling_logp_difference/max": 0.49950408935546875, "sampling/sampling_logp_difference/mean": 0.01571480929851532, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 236.609375, "completions/mean_terminated_length": 236.609375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.5057919025421143, "epoch": 1.0061274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.8573369416986352, "kl": 0.032287076115608215, "learning_rate": 8.4190987129466e-07, "loss": -0.021, "num_tokens": 25974188.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.634234070777893, "sampling/importance_sampling_ratio/mean": 0.9997125864028931, "sampling/importance_sampling_ratio/min": 0.6482210755348206, "sampling/sampling_logp_difference/max": 0.49117422103881836, "sampling/sampling_logp_difference/mean": 0.016607385128736496, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 228.359375, "completions/mean_terminated_length": 228.359375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4514944851398468, "epoch": 1.0073529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.8506365877176593, "kl": 0.0204324908554554, "learning_rate": 8.413897361948483e-07, "loss": 0.0116, "num_tokens": 26006323.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5485343933105469, "sampling/importance_sampling_ratio/mean": 1.000069499015808, "sampling/importance_sampling_ratio/min": 0.7095987200737, "sampling/sampling_logp_difference/max": 0.4373089075088501, "sampling/sampling_logp_difference/mean": 0.015193624421954155, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 240.65625, "completions/mean_terminated_length": 240.65625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3850115239620209, "epoch": 1.008578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01656511101086973, "kl": 0.022681277245283127, "learning_rate": 8.408689080954997e-07, "loss": 0.0002, "num_tokens": 26042717.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3688677549362183, "sampling/importance_sampling_ratio/mean": 0.9996740818023682, "sampling/importance_sampling_ratio/min": 0.19988328218460083, "sampling/sampling_logp_difference/max": 1.610021710395813, "sampling/sampling_logp_difference/mean": 0.014235937967896461, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 224.65625, "completions/mean_terminated_length": 224.65625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3661452829837799, "epoch": 1.0098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02098756637086174, "kl": 0.028736956417560577, "learning_rate": 8.403473880538625e-07, "loss": 0.0003, "num_tokens": 26077175.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4588247537612915, "sampling/importance_sampling_ratio/mean": 0.9994629621505737, "sampling/importance_sampling_ratio/min": 0.6147721409797668, "sampling/sampling_logp_difference/max": 0.48650360107421875, "sampling/sampling_logp_difference/mean": 0.01425854954868555, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4839499294757843, "epoch": 1.0110294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 1.0668102894492004, "kl": 0.035447005182504654, "learning_rate": 8.398251771285892e-07, "loss": -0.0105, "num_tokens": 26112863.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5414327383041382, "sampling/importance_sampling_ratio/mean": 0.999983012676239, "sampling/importance_sampling_ratio/min": 0.7225062847137451, "sampling/sampling_logp_difference/max": 0.4327123165130615, "sampling/sampling_logp_difference/mean": 0.017018748447299004, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 262.1875, "completions/mean_terminated_length": 262.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.437021404504776, "epoch": 1.0122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.820264979193408, "kl": 0.025450553745031357, "learning_rate": 8.393022763797346e-07, "loss": 0.0001, "num_tokens": 26144827.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4006553888320923, "sampling/importance_sampling_ratio/mean": 1.0000797510147095, "sampling/importance_sampling_ratio/min": 0.6503985524177551, "sampling/sampling_logp_difference/max": 0.430169939994812, "sampling/sampling_logp_difference/mean": 0.015213541686534882, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 164.296875, "completions/mean_terminated_length": 164.296875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3759983777999878, "epoch": 1.0134803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7982833386554936, "kl": 0.05652917921543121, "learning_rate": 8.387786868687548e-07, "loss": -0.0012, "num_tokens": 26167262.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3700846433639526, "sampling/importance_sampling_ratio/mean": 0.9994276762008667, "sampling/importance_sampling_ratio/min": 0.7327985167503357, "sampling/sampling_logp_difference/max": 0.31487250328063965, "sampling/sampling_logp_difference/mean": 0.014417910017073154, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 287.203125, "completions/mean_terminated_length": 287.203125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.5151238441467285, "epoch": 1.0147058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0133548511461008, "kl": 0.030357833951711655, "learning_rate": 8.382544096585026e-07, "loss": -0.0312, "num_tokens": 26200907.0, "reward": 0.1875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7406495809555054, "sampling/importance_sampling_ratio/mean": 0.9997864365577698, "sampling/importance_sampling_ratio/min": 0.5055920481681824, "sampling/sampling_logp_difference/max": 0.6820251941680908, "sampling/sampling_logp_difference/mean": 0.016376610845327377, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 219.203125, "completions/mean_terminated_length": 219.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.44265711307525635, "epoch": 1.0159313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.018610019581102484, "kl": 0.02551637962460518, "learning_rate": 8.37729445813228e-07, "loss": 0.0003, "num_tokens": 26235112.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.465740442276001, "sampling/importance_sampling_ratio/mean": 1.0003550052642822, "sampling/importance_sampling_ratio/min": 0.6710967421531677, "sampling/sampling_logp_difference/max": 0.3988419771194458, "sampling/sampling_logp_difference/mean": 0.014949493110179901, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 235.140625, "completions/mean_terminated_length": 235.140625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.42499053478240967, "epoch": 1.017156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.01943161888339381, "kl": 0.025752726942300797, "learning_rate": 8.372037963985741e-07, "loss": 0.0002, "num_tokens": 26273297.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3642884492874146, "sampling/importance_sampling_ratio/mean": 0.9999311566352844, "sampling/importance_sampling_ratio/min": 0.5483862161636353, "sampling/sampling_logp_difference/max": 0.6007754802703857, "sampling/sampling_logp_difference/mean": 0.015527062118053436, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 271.234375, "completions/mean_terminated_length": 271.234375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3281579613685608, "epoch": 1.0183823529411764, "frac_reward_zero_std": 0.5, "grad_norm": 0.8457087183973668, "kl": 0.030670788139104843, "learning_rate": 8.366774624815761e-07, "loss": -0.103, "num_tokens": 26311648.0, "reward": 0.4375, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.293837308883667, "sampling/importance_sampling_ratio/mean": 1.000064730644226, "sampling/importance_sampling_ratio/min": 0.6780655384063721, "sampling/sampling_logp_difference/max": 0.3885113000869751, "sampling/sampling_logp_difference/mean": 0.012746588326990604, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 182.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.38060179352760315, "epoch": 1.0196078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 1.1358667089034782, "kl": 0.02932559885084629, "learning_rate": 8.361504451306584e-07, "loss": -0.0295, "num_tokens": 26345548.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2997032403945923, "sampling/importance_sampling_ratio/mean": 0.9998878836631775, "sampling/importance_sampling_ratio/min": 0.48163238167762756, "sampling/sampling_logp_difference/max": 0.7305741310119629, "sampling/sampling_logp_difference/mean": 0.014157561585307121, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 235.484375, "completions/mean_terminated_length": 235.484375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.40709584951400757, "epoch": 1.0208333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.8908409729496476, "kl": 0.028004394844174385, "learning_rate": 8.356227454156328e-07, "loss": 0.0039, "num_tokens": 26376523.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4950038194656372, "sampling/importance_sampling_ratio/mean": 0.9999670386314392, "sampling/importance_sampling_ratio/min": 0.703667402267456, "sampling/sampling_logp_difference/max": 0.40212881565093994, "sampling/sampling_logp_difference/mean": 0.015612028539180756, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 217.65625, "completions/mean_terminated_length": 217.65625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4033668339252472, "epoch": 1.0220588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.02725360252468977, "kl": 0.03612522780895233, "learning_rate": 8.350943644076964e-07, "loss": 0.0004, "num_tokens": 26406869.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4186195135116577, "sampling/importance_sampling_ratio/mean": 0.9994950294494629, "sampling/importance_sampling_ratio/min": 0.6725078225135803, "sampling/sampling_logp_difference/max": 0.39674150943756104, "sampling/sampling_logp_difference/mean": 0.014155510812997818, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3583762049674988, "epoch": 1.0232843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.02149560434094024, "kl": 0.028283949941396713, "learning_rate": 8.34565303179429e-07, "loss": 0.0003, "num_tokens": 26434141.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3261923789978027, "sampling/importance_sampling_ratio/mean": 1.0003596544265747, "sampling/importance_sampling_ratio/min": 0.7813834547996521, "sampling/sampling_logp_difference/max": 0.2823120355606079, "sampling/sampling_logp_difference/mean": 0.013688826002180576, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 228.828125, "completions/mean_terminated_length": 228.828125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.49970585107803345, "epoch": 1.0245098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.3938016731403131, "kl": 0.08226551115512848, "learning_rate": 8.340355628047917e-07, "loss": -0.0592, "num_tokens": 26467666.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4455267190933228, "sampling/importance_sampling_ratio/mean": 1.000573992729187, "sampling/importance_sampling_ratio/min": 0.6101624965667725, "sampling/sampling_logp_difference/max": 0.4940299987792969, "sampling/sampling_logp_difference/mean": 0.017299897968769073, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 221.09375, "completions/mean_terminated_length": 221.09375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5714606046676636, "epoch": 1.025735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.026558899090748923, "kl": 0.04937228932976723, "learning_rate": 8.335051443591234e-07, "loss": 0.0005, "num_tokens": 26500936.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4323515892028809, "sampling/importance_sampling_ratio/mean": 1.0003485679626465, "sampling/importance_sampling_ratio/min": 0.7121589183807373, "sampling/sampling_logp_difference/max": 0.3593175411224365, "sampling/sampling_logp_difference/mean": 0.01874801144003868, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 209.3125, "completions/mean_terminated_length": 209.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3629034161567688, "epoch": 1.0269607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.016865839137336323, "kl": 0.024588044732809067, "learning_rate": 8.329740489191405e-07, "loss": 0.0002, "num_tokens": 26529948.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5092782974243164, "sampling/importance_sampling_ratio/mean": 1.0002012252807617, "sampling/importance_sampling_ratio/min": 0.6888720989227295, "sampling/sampling_logp_difference/max": 0.41163158416748047, "sampling/sampling_logp_difference/mean": 0.013675286434590816, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 269.453125, "completions/mean_terminated_length": 269.453125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.5257227420806885, "epoch": 1.0281862745098038, "frac_reward_zero_std": 0.25, "grad_norm": 1.1266783710781607, "kl": 0.06060445308685303, "learning_rate": 8.324422775629327e-07, "loss": -0.0037, "num_tokens": 26569433.0, "reward": 0.53125, "reward_std": 0.519389271736145, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4926996231079102, "sampling/importance_sampling_ratio/mean": 0.9997835755348206, "sampling/importance_sampling_ratio/min": 0.6224827170372009, "sampling/sampling_logp_difference/max": 0.4740394353866577, "sampling/sampling_logp_difference/mean": 0.01740849018096924, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 159.71875, "completions/mean_terminated_length": 159.71875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.39167678356170654, "epoch": 1.0294117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.02078355836308601, "kl": 0.02990805357694626, "learning_rate": 8.319098313699624e-07, "loss": 0.0003, "num_tokens": 26599079.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.368140459060669, "sampling/importance_sampling_ratio/mean": 0.9998688101768494, "sampling/importance_sampling_ratio/min": 0.6918001174926758, "sampling/sampling_logp_difference/max": 0.36845827102661133, "sampling/sampling_logp_difference/mean": 0.015625260770320892, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 221.078125, "completions/mean_terminated_length": 221.078125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.44650688767433167, "epoch": 1.0306372549019607, "frac_reward_zero_std": 0.75, "grad_norm": 0.8699635035829281, "kl": 0.03556312993168831, "learning_rate": 8.313767114210615e-07, "loss": 0.0255, "num_tokens": 26640492.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.722933292388916, "sampling/importance_sampling_ratio/mean": 0.9996669888496399, "sampling/importance_sampling_ratio/min": 0.44537386298179626, "sampling/sampling_logp_difference/max": 0.8088412284851074, "sampling/sampling_logp_difference/mean": 0.016315573826432228, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3267580270767212, "epoch": 1.031862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.020172033230415685, "kl": 0.031396303325891495, "learning_rate": 8.308429187984298e-07, "loss": 0.0003, "num_tokens": 26666036.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.559213399887085, "sampling/importance_sampling_ratio/mean": 1.000575304031372, "sampling/importance_sampling_ratio/min": 0.6239210367202759, "sampling/sampling_logp_difference/max": 0.47173142433166504, "sampling/sampling_logp_difference/mean": 0.013954175636172295, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 230.296875, "completions/mean_terminated_length": 230.296875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.46382033824920654, "epoch": 1.0330882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.8635833195747631, "kl": 0.02818339504301548, "learning_rate": 8.303084545856322e-07, "loss": 0.0231, "num_tokens": 26706951.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.2970181703567505, "sampling/importance_sampling_ratio/mean": 0.999815046787262, "sampling/importance_sampling_ratio/min": 0.7092636227607727, "sampling/sampling_logp_difference/max": 0.34352803230285645, "sampling/sampling_logp_difference/mean": 0.014819949865341187, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.44449472427368164, "epoch": 1.0343137254901962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0177674076373009, "kl": 0.02924432046711445, "learning_rate": 8.297733198675977e-07, "loss": 0.0003, "num_tokens": 26740691.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.601992130279541, "sampling/importance_sampling_ratio/mean": 1.0001962184906006, "sampling/importance_sampling_ratio/min": 0.7089138031005859, "sampling/sampling_logp_difference/max": 0.47124791145324707, "sampling/sampling_logp_difference/mean": 0.01642962172627449, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 181.34375, "completions/mean_terminated_length": 181.34375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.5077136754989624, "epoch": 1.0355392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.019854642900428564, "kl": 0.031858205795288086, "learning_rate": 8.292375157306155e-07, "loss": 0.0003, "num_tokens": 26772233.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3731887340545654, "sampling/importance_sampling_ratio/mean": 1.000213623046875, "sampling/importance_sampling_ratio/min": 0.6937328577041626, "sampling/sampling_logp_difference/max": 0.36566829681396484, "sampling/sampling_logp_difference/mean": 0.017039429396390915, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 180.1875, "completions/mean_terminated_length": 180.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.41495281457901, "epoch": 1.036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.026011467757728685, "kl": 0.058533675968647, "learning_rate": 8.287010432623343e-07, "loss": 0.0005, "num_tokens": 26799365.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7040212154388428, "sampling/importance_sampling_ratio/mean": 0.9999061226844788, "sampling/importance_sampling_ratio/min": 0.617724597454071, "sampling/sampling_logp_difference/max": 0.5329909324645996, "sampling/sampling_logp_difference/mean": 0.016645336523652077, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.4212398827075958, "epoch": 1.0379901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8988033018816445, "kl": 0.0370466485619545, "learning_rate": 8.281639035517591e-07, "loss": -0.0099, "num_tokens": 26824513.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.515293836593628, "sampling/importance_sampling_ratio/mean": 0.9997961521148682, "sampling/importance_sampling_ratio/min": 0.7043558955192566, "sampling/sampling_logp_difference/max": 0.41560935974121094, "sampling/sampling_logp_difference/mean": 0.016113050282001495, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 171.34375, "completions/mean_terminated_length": 171.34375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.37584012746810913, "epoch": 1.0392156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.04717020021178661, "kl": 0.03277963399887085, "learning_rate": 8.276260976892495e-07, "loss": 0.0003, "num_tokens": 26858087.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3748070001602173, "sampling/importance_sampling_ratio/mean": 1.0005311965942383, "sampling/importance_sampling_ratio/min": 0.6475529074668884, "sampling/sampling_logp_difference/max": 0.4345548152923584, "sampling/sampling_logp_difference/mean": 0.016502805054187775, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.47122788429260254, "epoch": 1.0404411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.018437721552177645, "kl": 0.028479089960455894, "learning_rate": 8.270876267665173e-07, "loss": 0.0003, "num_tokens": 26891211.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4462474584579468, "sampling/importance_sampling_ratio/mean": 1.000338077545166, "sampling/importance_sampling_ratio/min": 0.703247606754303, "sampling/sampling_logp_difference/max": 0.3689723014831543, "sampling/sampling_logp_difference/mean": 0.015550898388028145, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3868410587310791, "epoch": 1.0416666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.7911738619881935, "kl": 0.03665817528963089, "learning_rate": 8.265484918766242e-07, "loss": 0.0231, "num_tokens": 26915495.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.388287901878357, "sampling/importance_sampling_ratio/mean": 0.9996983408927917, "sampling/importance_sampling_ratio/min": 0.7110201120376587, "sampling/sampling_logp_difference/max": 0.34105461835861206, "sampling/sampling_logp_difference/mean": 0.014347558841109276, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 162.546875, "completions/mean_terminated_length": 162.546875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.45995205640792847, "epoch": 1.0428921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 1.0248271739019652, "kl": 0.03669149428606033, "learning_rate": 8.260086941139804e-07, "loss": 0.0013, "num_tokens": 26949210.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5009289979934692, "sampling/importance_sampling_ratio/mean": 0.9992746114730835, "sampling/importance_sampling_ratio/min": 0.781379222869873, "sampling/sampling_logp_difference/max": 0.4060842990875244, "sampling/sampling_logp_difference/mean": 0.016376223415136337, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 195.765625, "completions/mean_terminated_length": 195.765625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.5225400924682617, "epoch": 1.0441176470588236, "frac_reward_zero_std": 0.25, "grad_norm": 1.351549946282612, "kl": 0.059774432331323624, "learning_rate": 8.254682345743405e-07, "loss": -0.0329, "num_tokens": 26978331.0, "reward": 0.65625, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.2851217985153198, "sampling/importance_sampling_ratio/mean": 0.999582052230835, "sampling/importance_sampling_ratio/min": 0.6393632292747498, "sampling/sampling_logp_difference/max": 0.4472825527191162, "sampling/sampling_logp_difference/mean": 0.018038488924503326, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 227.421875, "completions/mean_terminated_length": 227.421875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4862522482872009, "epoch": 1.045343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6804139902347937, "kl": 0.03782299533486366, "learning_rate": 8.249271143548036e-07, "loss": 0.0221, "num_tokens": 27012870.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4067710638046265, "sampling/importance_sampling_ratio/mean": 0.9999182820320129, "sampling/importance_sampling_ratio/min": 0.6331161856651306, "sampling/sampling_logp_difference/max": 0.45710134506225586, "sampling/sampling_logp_difference/mean": 0.015550434589385986, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 196.390625, "completions/mean_terminated_length": 196.390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.45215368270874023, "epoch": 1.0465686274509804, "frac_reward_zero_std": 0.25, "grad_norm": 1.4462448482716836, "kl": 0.03857942670583725, "learning_rate": 8.243853345538093e-07, "loss": 0.0486, "num_tokens": 27048175.0, "reward": 0.6875, "reward_std": 0.551956295967102, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6944257020950317, "sampling/importance_sampling_ratio/mean": 0.9998030066490173, "sampling/importance_sampling_ratio/min": 0.7130687832832336, "sampling/sampling_logp_difference/max": 0.5273438692092896, "sampling/sampling_logp_difference/mean": 0.01590794324874878, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 213.828125, "completions/mean_terminated_length": 213.828125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3685460686683655, "epoch": 1.0477941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.8596830434152778, "kl": 0.035765089094638824, "learning_rate": 8.238428962711362e-07, "loss": -0.004, "num_tokens": 27079172.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5653468370437622, "sampling/importance_sampling_ratio/mean": 0.9990967512130737, "sampling/importance_sampling_ratio/min": 0.4404228627681732, "sampling/sampling_logp_difference/max": 0.8200199604034424, "sampling/sampling_logp_difference/mean": 0.013786762952804565, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 201.640625, "completions/mean_terminated_length": 201.640625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4726966917514801, "epoch": 1.0490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.04732635726139525, "kl": 0.036044515669345856, "learning_rate": 8.232998006078997e-07, "loss": 0.0004, "num_tokens": 27112253.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4486273527145386, "sampling/importance_sampling_ratio/mean": 1.000450849533081, "sampling/importance_sampling_ratio/min": 0.6108221411705017, "sampling/sampling_logp_difference/max": 0.4929494857788086, "sampling/sampling_logp_difference/mean": 0.015688953921198845, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 182.015625, "completions/mean_terminated_length": 182.015625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3683471977710724, "epoch": 1.0502450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.020066894840116995, "kl": 0.03780883550643921, "learning_rate": 8.227560486665498e-07, "loss": 0.0004, "num_tokens": 27141054.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4763715267181396, "sampling/importance_sampling_ratio/mean": 0.9994968771934509, "sampling/importance_sampling_ratio/min": 0.7207610607147217, "sampling/sampling_logp_difference/max": 0.38958740234375, "sampling/sampling_logp_difference/mean": 0.013770157471299171, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.48345518112182617, "epoch": 1.0514705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.9422495834300965, "kl": 0.05197905749082565, "learning_rate": 8.222116415508682e-07, "loss": 0.0267, "num_tokens": 27167206.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3234844207763672, "sampling/importance_sampling_ratio/mean": 1.0000250339508057, "sampling/importance_sampling_ratio/min": 0.6663122773170471, "sampling/sampling_logp_difference/max": 0.40599679946899414, "sampling/sampling_logp_difference/mean": 0.0172736719250679, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 189.859375, "completions/mean_terminated_length": 189.859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.40405958890914917, "epoch": 1.0526960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.02064729612060633, "kl": 0.03284838795661926, "learning_rate": 8.21666580365967e-07, "loss": 0.0003, "num_tokens": 27202957.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.472433090209961, "sampling/importance_sampling_ratio/mean": 1.000199317932129, "sampling/importance_sampling_ratio/min": 0.6393764019012451, "sampling/sampling_logp_difference/max": 0.4472620487213135, "sampling/sampling_logp_difference/mean": 0.014828795567154884, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 211.03125, "completions/mean_terminated_length": 211.03125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.5035507678985596, "epoch": 1.053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.021765292983725287, "kl": 0.052284855395555496, "learning_rate": 8.211208662182858e-07, "loss": 0.0004, "num_tokens": 27237583.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002892017364502, "sampling/importance_sampling_ratio/min": 0.7212308645248413, "sampling/sampling_logp_difference/max": 0.8912079334259033, "sampling/sampling_logp_difference/mean": 0.016904963180422783, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 173.359375, "completions/mean_terminated_length": 173.359375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.5036715269088745, "epoch": 1.0551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.183793972805832, "kl": 0.05257159471511841, "learning_rate": 8.205745002155899e-07, "loss": -0.1866, "num_tokens": 27267126.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.383525013923645, "sampling/importance_sampling_ratio/mean": 0.9996995329856873, "sampling/importance_sampling_ratio/min": 0.6777058839797974, "sampling/sampling_logp_difference/max": 0.3890419006347656, "sampling/sampling_logp_difference/mean": 0.01687629148364067, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 188.578125, "completions/mean_terminated_length": 188.578125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4505173861980438, "epoch": 1.0563725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.02493041736236707, "kl": 0.04182557761669159, "learning_rate": 8.200274834669675e-07, "loss": 0.0004, "num_tokens": 27294459.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4025495052337646, "sampling/importance_sampling_ratio/mean": 0.9999800324440002, "sampling/importance_sampling_ratio/min": 0.6474964022636414, "sampling/sampling_logp_difference/max": 0.43464207649230957, "sampling/sampling_logp_difference/mean": 0.015560134314000607, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4979953169822693, "epoch": 1.0575980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.03040486590745786, "kl": 0.050923287868499756, "learning_rate": 8.194798170828279e-07, "loss": 0.0005, "num_tokens": 27326403.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6092044115066528, "sampling/importance_sampling_ratio/mean": 0.999182939529419, "sampling/importance_sampling_ratio/min": 0.7245404720306396, "sampling/sampling_logp_difference/max": 0.4757399559020996, "sampling/sampling_logp_difference/mean": 0.016028741374611855, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 177.40625, "completions/mean_terminated_length": 177.40625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4137076735496521, "epoch": 1.0588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02850708317913002, "kl": 0.03606805577874184, "learning_rate": 8.189315021748993e-07, "loss": 0.0004, "num_tokens": 27354621.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5470267534255981, "sampling/importance_sampling_ratio/mean": 1.0001707077026367, "sampling/importance_sampling_ratio/min": 0.6292845606803894, "sampling/sampling_logp_difference/max": 0.46317172050476074, "sampling/sampling_logp_difference/mean": 0.0135587677359581, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 217.140625, "completions/mean_terminated_length": 217.140625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.43804386258125305, "epoch": 1.0600490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.019411988475436792, "kl": 0.03151471167802811, "learning_rate": 8.183825398562263e-07, "loss": 0.0003, "num_tokens": 27386310.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3624889850616455, "sampling/importance_sampling_ratio/mean": 0.9998658895492554, "sampling/importance_sampling_ratio/min": 0.562029242515564, "sampling/sampling_logp_difference/max": 0.5762014389038086, "sampling/sampling_logp_difference/mean": 0.014450537040829659, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 139.828125, "completions/mean_terminated_length": 139.828125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5208714008331299, "epoch": 1.0612745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.03417919040874343, "kl": 0.06288649886846542, "learning_rate": 8.178329312411676e-07, "loss": 0.0006, "num_tokens": 27413643.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.364408016204834, "sampling/importance_sampling_ratio/mean": 1.0002866983413696, "sampling/importance_sampling_ratio/min": 0.6147370934486389, "sampling/sampling_logp_difference/max": 0.486560583114624, "sampling/sampling_logp_difference/mean": 0.017352543771266937, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 199.515625, "completions/mean_terminated_length": 199.515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4414290189743042, "epoch": 1.0625, "frac_reward_zero_std": 1.0, "grad_norm": 0.02473058072426531, "kl": 0.04589700326323509, "learning_rate": 8.172826774453936e-07, "loss": 0.0004, "num_tokens": 27438748.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3940861225128174, "sampling/importance_sampling_ratio/mean": 1.0002847909927368, "sampling/importance_sampling_ratio/min": 0.6868340969085693, "sampling/sampling_logp_difference/max": 0.3756624460220337, "sampling/sampling_logp_difference/mean": 0.014572693035006523, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.6011121273040771, "epoch": 1.0637254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.0148178383460087, "kl": 0.051899440586566925, "learning_rate": 8.16731779585885e-07, "loss": -0.1049, "num_tokens": 27475528.0, "reward": -0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5088313817977905, "sampling/importance_sampling_ratio/mean": 0.9997458457946777, "sampling/importance_sampling_ratio/min": 0.7129045724868774, "sampling/sampling_logp_difference/max": 0.41133546829223633, "sampling/sampling_logp_difference/mean": 0.019103452563285828, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 203.078125, "completions/mean_terminated_length": 203.078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.5007017850875854, "epoch": 1.0649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02232365206301372, "kl": 0.04034339264035225, "learning_rate": 8.161802387809292e-07, "loss": 0.0004, "num_tokens": 27506813.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4619460105895996, "sampling/importance_sampling_ratio/mean": 0.9996985793113708, "sampling/importance_sampling_ratio/min": 0.6675713062286377, "sampling/sampling_logp_difference/max": 0.40410900115966797, "sampling/sampling_logp_difference/mean": 0.01666172593832016, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 255.515625, "completions/mean_terminated_length": 255.515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4940684139728546, "epoch": 1.0661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.023715901489987402, "kl": 0.03826040029525757, "learning_rate": 8.156280561501194e-07, "loss": 0.0004, "num_tokens": 27546030.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5691092014312744, "sampling/importance_sampling_ratio/mean": 0.9998692870140076, "sampling/importance_sampling_ratio/min": 0.6319237947463989, "sampling/sampling_logp_difference/max": 0.4589865207672119, "sampling/sampling_logp_difference/mean": 0.015382746234536171, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 206.1875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4447386860847473, "epoch": 1.0674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.021946825285994783, "kl": 0.040563203394412994, "learning_rate": 8.150752328143513e-07, "loss": 0.0004, "num_tokens": 27579738.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.34888756275177, "sampling/importance_sampling_ratio/mean": 0.9999980926513672, "sampling/importance_sampling_ratio/min": 0.5510970950126648, "sampling/sampling_logp_difference/max": 0.5958442687988281, "sampling/sampling_logp_difference/mean": 0.014817440882325172, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 250.890625, "completions/mean_terminated_length": 250.890625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.5305104851722717, "epoch": 1.0686274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.02157410289594113, "kl": 0.04023445397615433, "learning_rate": 8.145217698958211e-07, "loss": 0.0004, "num_tokens": 27612931.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.401291012763977, "sampling/importance_sampling_ratio/mean": 1.0001248121261597, "sampling/importance_sampling_ratio/min": 0.6908958554267883, "sampling/sampling_logp_difference/max": 0.3697662353515625, "sampling/sampling_logp_difference/mean": 0.015897506847977638, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 192.578125, "completions/mean_terminated_length": 192.578125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.452565997838974, "epoch": 1.0698529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.021513085202711633, "kl": 0.04016255587339401, "learning_rate": 8.139676685180236e-07, "loss": 0.0004, "num_tokens": 27641848.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5309851169586182, "sampling/importance_sampling_ratio/mean": 0.9997951984405518, "sampling/importance_sampling_ratio/min": 0.6802050471305847, "sampling/sampling_logp_difference/max": 0.42591142654418945, "sampling/sampling_logp_difference/mean": 0.013624733313918114, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 219.296875, "completions/mean_terminated_length": 219.296875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.38810229301452637, "epoch": 1.071078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01823538234640203, "kl": 0.03198229894042015, "learning_rate": 8.134129298057495e-07, "loss": 0.0003, "num_tokens": 27673723.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4551739692687988, "sampling/importance_sampling_ratio/mean": 1.0002098083496094, "sampling/importance_sampling_ratio/min": 0.6292175054550171, "sampling/sampling_logp_difference/max": 0.46327829360961914, "sampling/sampling_logp_difference/mean": 0.013081250712275505, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 199.984375, "completions/mean_terminated_length": 199.984375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.47072750329971313, "epoch": 1.0723039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.027011691257969865, "kl": 0.03940771520137787, "learning_rate": 8.128575548850832e-07, "loss": 0.0004, "num_tokens": 27702010.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.458251714706421, "sampling/importance_sampling_ratio/mean": 0.9996449947357178, "sampling/importance_sampling_ratio/min": 0.6385555267333984, "sampling/sampling_logp_difference/max": 0.4485466480255127, "sampling/sampling_logp_difference/mean": 0.015245177783071995, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 192.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5234087705612183, "epoch": 1.0735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.023714850434828398, "kl": 0.043784648180007935, "learning_rate": 8.123015448834005e-07, "loss": 0.0004, "num_tokens": 27733526.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4594935178756714, "sampling/importance_sampling_ratio/mean": 1.0002353191375732, "sampling/importance_sampling_ratio/min": 0.6867315769195557, "sampling/sampling_logp_difference/max": 0.37808942794799805, "sampling/sampling_logp_difference/mean": 0.016243984922766685, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 231.734375, "completions/mean_terminated_length": 231.734375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4006674587726593, "epoch": 1.0747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.6290842036901704, "kl": 0.051520030945539474, "learning_rate": 8.117449009293668e-07, "loss": 0.0082, "num_tokens": 27763413.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2746437788009644, "sampling/importance_sampling_ratio/mean": 0.9999599456787109, "sampling/importance_sampling_ratio/min": 0.7638634443283081, "sampling/sampling_logp_difference/max": 0.2693662643432617, "sampling/sampling_logp_difference/mean": 0.013918805867433548, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 215.78125, "completions/mean_terminated_length": 215.78125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4207499325275421, "epoch": 1.0759803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7575280902031539, "kl": 0.044188033789396286, "learning_rate": 8.111876241529337e-07, "loss": -0.0273, "num_tokens": 27794839.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6420798301696777, "sampling/importance_sampling_ratio/mean": 1.0000677108764648, "sampling/importance_sampling_ratio/min": 0.6742939352989197, "sampling/sampling_logp_difference/max": 0.49596357345581055, "sampling/sampling_logp_difference/mean": 0.014050956815481186, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 211.890625, "completions/mean_terminated_length": 211.890625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.49590641260147095, "epoch": 1.0772058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.7160266551808369, "kl": 0.059609800577163696, "learning_rate": 8.106297156853379e-07, "loss": 0.0037, "num_tokens": 27824096.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5287641286849976, "sampling/importance_sampling_ratio/mean": 1.0005055665969849, "sampling/importance_sampling_ratio/min": 0.6962414383888245, "sampling/sampling_logp_difference/max": 0.42445969581604004, "sampling/sampling_logp_difference/mean": 0.01589365303516388, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 236.71875, "completions/mean_terminated_length": 236.71875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4957412779331207, "epoch": 1.0784313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7651120974169411, "kl": 0.039259374141693115, "learning_rate": 8.100711766590982e-07, "loss": -0.0129, "num_tokens": 27857598.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4374808073043823, "sampling/importance_sampling_ratio/mean": 1.0001131296157837, "sampling/importance_sampling_ratio/min": 0.7092178463935852, "sampling/sampling_logp_difference/max": 0.36289215087890625, "sampling/sampling_logp_difference/mean": 0.015798121690750122, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 156.265625, "completions/mean_terminated_length": 156.265625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.38593214750289917, "epoch": 1.079656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.027970369726454135, "kl": 0.041295409202575684, "learning_rate": 8.095120082080134e-07, "loss": 0.0004, "num_tokens": 27883247.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3590514659881592, "sampling/importance_sampling_ratio/mean": 0.9997127056121826, "sampling/importance_sampling_ratio/min": 0.6409491300582886, "sampling/sampling_logp_difference/max": 0.4448051452636719, "sampling/sampling_logp_difference/mean": 0.014706568792462349, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 213.640625, "completions/mean_terminated_length": 213.640625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.5414191484451294, "epoch": 1.0808823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.025291530606162897, "kl": 0.050102598965168, "learning_rate": 8.089522114671602e-07, "loss": 0.0005, "num_tokens": 27918648.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4569766521453857, "sampling/importance_sampling_ratio/mean": 1.0000700950622559, "sampling/importance_sampling_ratio/min": 0.6865126490592957, "sampling/sampling_logp_difference/max": 0.37636351585388184, "sampling/sampling_logp_difference/mean": 0.01677125319838524, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 220.421875, "completions/mean_terminated_length": 220.421875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.5107558965682983, "epoch": 1.0821078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.02311364768778726, "kl": 0.03638807311654091, "learning_rate": 8.083917875728905e-07, "loss": 0.0004, "num_tokens": 27952627.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.629607915878296, "sampling/importance_sampling_ratio/mean": 0.9997633099555969, "sampling/importance_sampling_ratio/min": 0.635168731212616, "sampling/sampling_logp_difference/max": 0.4883394241333008, "sampling/sampling_logp_difference/mean": 0.017013484612107277, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4580460488796234, "epoch": 1.0833333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.7049973427062836, "kl": 0.04482467472553253, "learning_rate": 8.07830737662829e-07, "loss": 0.0085, "num_tokens": 27985251.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5376219749450684, "sampling/importance_sampling_ratio/mean": 0.9997838139533997, "sampling/importance_sampling_ratio/min": 0.6243430376052856, "sampling/sampling_logp_difference/max": 0.471055269241333, "sampling/sampling_logp_difference/mean": 0.014974001795053482, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 158.34375, "completions/mean_terminated_length": 158.34375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.45414239168167114, "epoch": 1.0845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.019143579287490053, "kl": 0.03195573389530182, "learning_rate": 8.072690628758721e-07, "loss": 0.0003, "num_tokens": 28013273.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.356048822402954, "sampling/importance_sampling_ratio/mean": 1.0004680156707764, "sampling/importance_sampling_ratio/min": 0.7771909832954407, "sampling/sampling_logp_difference/max": 0.30457520484924316, "sampling/sampling_logp_difference/mean": 0.015054703690111637, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 209.828125, "completions/mean_terminated_length": 209.828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.353268563747406, "epoch": 1.0857843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.014608857233496962, "kl": 0.02390231378376484, "learning_rate": 8.067067643521833e-07, "loss": 0.0002, "num_tokens": 28043390.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5793465375900269, "sampling/importance_sampling_ratio/mean": 1.000211477279663, "sampling/importance_sampling_ratio/min": 0.695388674736023, "sampling/sampling_logp_difference/max": 0.45701122283935547, "sampling/sampling_logp_difference/mean": 0.012544278055429459, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 260.5625, "completions/mean_terminated_length": 260.5625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4316437244415283, "epoch": 1.0870098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.016694614006907024, "kl": 0.028756748884916306, "learning_rate": 8.061438432331934e-07, "loss": 0.0003, "num_tokens": 28079250.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001678466796875, "sampling/importance_sampling_ratio/min": 0.6582571864128113, "sampling/sampling_logp_difference/max": 0.8504958152770996, "sampling/sampling_logp_difference/mean": 0.01411934569478035, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 222.15625, "completions/mean_terminated_length": 222.15625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4280780553817749, "epoch": 1.088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.0223459707735028, "kl": 0.036168213933706284, "learning_rate": 8.055803006615965e-07, "loss": -0.1477, "num_tokens": 28108428.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.384476900100708, "sampling/importance_sampling_ratio/mean": 0.9997614622116089, "sampling/importance_sampling_ratio/min": 0.6444646120071411, "sampling/sampling_logp_difference/max": 0.43933534622192383, "sampling/sampling_logp_difference/mean": 0.014062181115150452, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 170.015625, "completions/mean_terminated_length": 170.015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4003869891166687, "epoch": 1.0894607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.02230605781685738, "kl": 0.03253406286239624, "learning_rate": 8.050161377813485e-07, "loss": 0.0003, "num_tokens": 28138093.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4407479763031006, "sampling/importance_sampling_ratio/mean": 0.9998146891593933, "sampling/importance_sampling_ratio/min": 0.6737135648727417, "sampling/sampling_logp_difference/max": 0.394950270652771, "sampling/sampling_logp_difference/mean": 0.015155184082686901, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 182.046875, "completions/mean_terminated_length": 182.046875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.43908727169036865, "epoch": 1.0906862745098038, "frac_reward_zero_std": 1.0, "grad_norm": 0.06986324859247232, "kl": 0.03759096935391426, "learning_rate": 8.04451355737664e-07, "loss": 0.0004, "num_tokens": 28165760.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6099239587783813, "sampling/importance_sampling_ratio/mean": 1.0006098747253418, "sampling/importance_sampling_ratio/min": 0.12663482129573822, "sampling/sampling_logp_difference/max": 2.0664477348327637, "sampling/sampling_logp_difference/mean": 0.0157114639878273, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 198.65625, "completions/mean_terminated_length": 198.65625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4717601537704468, "epoch": 1.0919117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.03376924190707671, "kl": 0.04111418128013611, "learning_rate": 8.03885955677015e-07, "loss": 0.0004, "num_tokens": 28200954.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5522520542144775, "sampling/importance_sampling_ratio/mean": 1.0006616115570068, "sampling/importance_sampling_ratio/min": 0.664037823677063, "sampling/sampling_logp_difference/max": 0.43970680236816406, "sampling/sampling_logp_difference/mean": 0.01586918905377388, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 233.34375, "completions/mean_terminated_length": 233.34375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4475059509277344, "epoch": 1.093137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.7565513334532075, "kl": 0.04115685820579529, "learning_rate": 8.033199387471276e-07, "loss": 0.0124, "num_tokens": 28245104.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4491372108459473, "sampling/importance_sampling_ratio/mean": 1.0002517700195312, "sampling/importance_sampling_ratio/min": 0.5148535966873169, "sampling/sampling_logp_difference/max": 0.6638727188110352, "sampling/sampling_logp_difference/mean": 0.015666604042053223, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 226.484375, "completions/mean_terminated_length": 226.484375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.35593605041503906, "epoch": 1.094362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.015531131966394135, "kl": 0.030154259875416756, "learning_rate": 8.027533060969806e-07, "loss": 0.0003, "num_tokens": 28280367.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3830482959747314, "sampling/importance_sampling_ratio/mean": 0.9998922944068909, "sampling/importance_sampling_ratio/min": 0.5699650645256042, "sampling/sampling_logp_difference/max": 0.5621802806854248, "sampling/sampling_logp_difference/mean": 0.012990564107894897, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 215.15625, "completions/mean_terminated_length": 215.15625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.41725218296051025, "epoch": 1.0955882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.6636959082142803, "kl": 0.02948814630508423, "learning_rate": 8.021860588768021e-07, "loss": -0.0097, "num_tokens": 28309177.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4687278270721436, "sampling/importance_sampling_ratio/mean": 1.0000178813934326, "sampling/importance_sampling_ratio/min": 0.6812041997909546, "sampling/sampling_logp_difference/max": 0.3843965530395508, "sampling/sampling_logp_difference/mean": 0.014885883778333664, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 214.609375, "completions/mean_terminated_length": 214.609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.40551576018333435, "epoch": 1.0968137254901962, "frac_reward_zero_std": 1.0, "grad_norm": 0.017015254100040154, "kl": 0.02709297090768814, "learning_rate": 8.016181982380681e-07, "loss": 0.0003, "num_tokens": 28340528.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.445603609085083, "sampling/importance_sampling_ratio/mean": 1.000315546989441, "sampling/importance_sampling_ratio/min": 0.6513819098472595, "sampling/sampling_logp_difference/max": 0.4286590814590454, "sampling/sampling_logp_difference/mean": 0.014110507443547249, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 141.015625, "completions/mean_terminated_length": 141.015625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.36902138590812683, "epoch": 1.0980392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.021789493080759108, "kl": 0.03306157514452934, "learning_rate": 8.010497253335e-07, "loss": 0.0003, "num_tokens": 28365153.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3766553401947021, "sampling/importance_sampling_ratio/mean": 1.0002944469451904, "sampling/importance_sampling_ratio/min": 0.6906982064247131, "sampling/sampling_logp_difference/max": 0.3700523376464844, "sampling/sampling_logp_difference/mean": 0.0150392334908247, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 220.5625, "completions/mean_terminated_length": 220.5625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.41868674755096436, "epoch": 1.099264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.5352552104936306, "kl": 0.028247395530343056, "learning_rate": 8.004806413170612e-07, "loss": -0.0639, "num_tokens": 28396261.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6137233972549438, "sampling/importance_sampling_ratio/mean": 1.000014066696167, "sampling/importance_sampling_ratio/min": 0.7313059568405151, "sampling/sampling_logp_difference/max": 0.4785442352294922, "sampling/sampling_logp_difference/mean": 0.014314599335193634, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 225.828125, "completions/mean_terminated_length": 225.828125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4448871612548828, "epoch": 1.1004901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.01983500442272238, "kl": 0.03638867661356926, "learning_rate": 7.999109473439569e-07, "loss": 0.0003, "num_tokens": 28427466.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5072325468063354, "sampling/importance_sampling_ratio/mean": 1.0005900859832764, "sampling/importance_sampling_ratio/min": 0.6686456203460693, "sampling/sampling_logp_difference/max": 0.4102752208709717, "sampling/sampling_logp_difference/mean": 0.015608585439622402, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 209.109375, "completions/mean_terminated_length": 209.109375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3570801615715027, "epoch": 1.1017156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 0.6537754568071588, "kl": 0.039102356880903244, "learning_rate": 7.993406445706292e-07, "loss": 0.0209, "num_tokens": 28459969.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.441141128540039, "sampling/importance_sampling_ratio/mean": 0.9999960660934448, "sampling/importance_sampling_ratio/min": 0.6254708170890808, "sampling/sampling_logp_difference/max": 0.4692506790161133, "sampling/sampling_logp_difference/mean": 0.013170319609344006, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3478234112262726, "epoch": 1.1029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.016751932293840175, "kl": 0.0287268478423357, "learning_rate": 7.987697341547568e-07, "loss": 0.0003, "num_tokens": 28484805.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4843366146087646, "sampling/importance_sampling_ratio/mean": 1.000046968460083, "sampling/importance_sampling_ratio/min": 0.6069750785827637, "sampling/sampling_logp_difference/max": 0.499267578125, "sampling/sampling_logp_difference/mean": 0.01609545573592186, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 193.546875, "completions/mean_terminated_length": 193.546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.37323224544525146, "epoch": 1.1041666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.015952773364252184, "kl": 0.030691295862197876, "learning_rate": 7.981982172552517e-07, "loss": 0.0003, "num_tokens": 28515672.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.44472074508667, "sampling/importance_sampling_ratio/mean": 1.0006976127624512, "sampling/importance_sampling_ratio/min": 0.693408727645874, "sampling/sampling_logp_difference/max": 0.3679161071777344, "sampling/sampling_logp_difference/mean": 0.014546047896146774, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 167.53125, "completions/mean_terminated_length": 167.53125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.32717472314834595, "epoch": 1.1053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.02514619694392657, "kl": 0.031751178205013275, "learning_rate": 7.976260950322571e-07, "loss": 0.0003, "num_tokens": 28540826.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5320740938186646, "sampling/importance_sampling_ratio/mean": 0.9992892146110535, "sampling/importance_sampling_ratio/min": 0.4944298565387726, "sampling/sampling_logp_difference/max": 0.7043499946594238, "sampling/sampling_logp_difference/mean": 0.014897492714226246, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 222.8125, "completions/mean_terminated_length": 222.8125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.45857465267181396, "epoch": 1.1066176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.8008073300805483, "kl": 0.03539489582180977, "learning_rate": 7.970533686471448e-07, "loss": 0.0055, "num_tokens": 28579134.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3872824907302856, "sampling/importance_sampling_ratio/mean": 0.9999810457229614, "sampling/importance_sampling_ratio/min": 0.6253830790519714, "sampling/sampling_logp_difference/max": 0.469390869140625, "sampling/sampling_logp_difference/mean": 0.015618794597685337, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 200.328125, "completions/mean_terminated_length": 200.328125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.30752766132354736, "epoch": 1.107843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.01553239892624823, "kl": 0.026053044945001602, "learning_rate": 7.964800392625128e-07, "loss": 0.0002, "num_tokens": 28610195.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 0.9998340010643005, "sampling/importance_sampling_ratio/min": 0.680486798286438, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.012396201491355896, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 174.0625, "completions/mean_terminated_length": 174.0625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.323736310005188, "epoch": 1.1090686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.8197624338844457, "kl": 0.03432891145348549, "learning_rate": 7.959061080421838e-07, "loss": -0.0107, "num_tokens": 28639703.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6952557563781738, "sampling/importance_sampling_ratio/mean": 0.9998811483383179, "sampling/importance_sampling_ratio/min": 0.7749429941177368, "sampling/sampling_logp_difference/max": 0.5278335809707642, "sampling/sampling_logp_difference/mean": 0.0137240681797266, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.40583595633506775, "epoch": 1.1102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.024540924093596973, "kl": 0.03756669536232948, "learning_rate": 7.953315761512017e-07, "loss": 0.0004, "num_tokens": 28666139.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4206631183624268, "sampling/importance_sampling_ratio/mean": 0.9998972415924072, "sampling/importance_sampling_ratio/min": 0.6482253074645996, "sampling/sampling_logp_difference/max": 0.4335169792175293, "sampling/sampling_logp_difference/mean": 0.0156668983399868, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 198.90625, "completions/mean_terminated_length": 198.90625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.43321409821510315, "epoch": 1.1115196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.016655306830246297, "kl": 0.028577543795108795, "learning_rate": 7.947564447558299e-07, "loss": 0.0003, "num_tokens": 28694597.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.286690354347229, "sampling/importance_sampling_ratio/mean": 1.0003197193145752, "sampling/importance_sampling_ratio/min": 0.7250173687934875, "sampling/sampling_logp_difference/max": 0.3215596675872803, "sampling/sampling_logp_difference/mean": 0.015658333897590637, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 224.984375, "completions/mean_terminated_length": 224.984375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4762854278087616, "epoch": 1.1127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.017923507835296153, "kl": 0.025353986769914627, "learning_rate": 7.941807150235485e-07, "loss": 0.0003, "num_tokens": 28730180.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.635606050491333, "sampling/importance_sampling_ratio/mean": 1.000074863433838, "sampling/importance_sampling_ratio/min": 0.7178359627723694, "sampling/sampling_logp_difference/max": 0.49201345443725586, "sampling/sampling_logp_difference/mean": 0.015965506434440613, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 246.34375, "completions/mean_terminated_length": 246.34375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3268033564090729, "epoch": 1.1139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01619742569804815, "kl": 0.02567506767809391, "learning_rate": 7.936043881230525e-07, "loss": 0.0002, "num_tokens": 28764746.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4772764444351196, "sampling/importance_sampling_ratio/mean": 1.0004796981811523, "sampling/importance_sampling_ratio/min": 0.6623192429542542, "sampling/sampling_logp_difference/max": 0.41200757026672363, "sampling/sampling_logp_difference/mean": 0.012335414066910744, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 223.015625, "completions/mean_terminated_length": 223.015625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4025377333164215, "epoch": 1.1151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.01725664674190133, "kl": 0.030949950218200684, "learning_rate": 7.930274652242491e-07, "loss": 0.0003, "num_tokens": 28795531.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744267702102661, "sampling/importance_sampling_ratio/mean": 1.000022530555725, "sampling/importance_sampling_ratio/min": 0.6672366857528687, "sampling/sampling_logp_difference/max": 0.4538912773132324, "sampling/sampling_logp_difference/mean": 0.015366973355412483, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 190.484375, "completions/mean_terminated_length": 190.484375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3960995376110077, "epoch": 1.116421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.04813797084800073, "kl": 0.04051578789949417, "learning_rate": 7.924499474982551e-07, "loss": 0.0004, "num_tokens": 28832458.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.566362738609314, "sampling/importance_sampling_ratio/mean": 1.0004899501800537, "sampling/importance_sampling_ratio/min": 0.6762630939483643, "sampling/sampling_logp_difference/max": 0.44875621795654297, "sampling/sampling_logp_difference/mean": 0.014583173207938671, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 195.78125, "completions/mean_terminated_length": 195.78125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.47338631749153137, "epoch": 1.1176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.023064169387888838, "kl": 0.029132433235645294, "learning_rate": 7.91871836117395e-07, "loss": 0.0003, "num_tokens": 28859196.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5592929124832153, "sampling/importance_sampling_ratio/mean": 1.0006070137023926, "sampling/importance_sampling_ratio/min": 0.7059069275856018, "sampling/sampling_logp_difference/max": 0.4442324638366699, "sampling/sampling_logp_difference/mean": 0.01768975704908371, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 189.4375, "completions/mean_terminated_length": 189.4375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3763211667537689, "epoch": 1.1188725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.01928194689201758, "kl": 0.02983258292078972, "learning_rate": 7.91293132255198e-07, "loss": 0.0003, "num_tokens": 28892296.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999362230300903, "sampling/importance_sampling_ratio/min": 0.6273563504219055, "sampling/sampling_logp_difference/max": 0.8365778923034668, "sampling/sampling_logp_difference/mean": 0.01438450999557972, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.363294780254364, "epoch": 1.1200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.017272240877062427, "kl": 0.02581132762134075, "learning_rate": 7.907138370863967e-07, "loss": 0.0003, "num_tokens": 28921964.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6357020139694214, "sampling/importance_sampling_ratio/mean": 1.0008468627929688, "sampling/importance_sampling_ratio/min": 0.1908649355173111, "sampling/sampling_logp_difference/max": 1.656189203262329, "sampling/sampling_logp_difference/mean": 0.01467039342969656, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 213.046875, "completions/mean_terminated_length": 213.046875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.42518073320388794, "epoch": 1.1213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01869874756915245, "kl": 0.027808737009763718, "learning_rate": 7.901339517869232e-07, "loss": 0.0003, "num_tokens": 28955839.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6528089046478271, "sampling/importance_sampling_ratio/mean": 1.0004780292510986, "sampling/importance_sampling_ratio/min": 0.7308378219604492, "sampling/sampling_logp_difference/max": 0.5024762153625488, "sampling/sampling_logp_difference/mean": 0.01490770187228918, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 162.5625, "completions/mean_terminated_length": 162.5625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3367150127887726, "epoch": 1.1225490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.017023830337671672, "kl": 0.027068888768553734, "learning_rate": 7.895534775339083e-07, "loss": 0.0003, "num_tokens": 28986899.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3988438844680786, "sampling/importance_sampling_ratio/mean": 0.9999134540557861, "sampling/importance_sampling_ratio/min": 0.6238155961036682, "sampling/sampling_logp_difference/max": 0.47190046310424805, "sampling/sampling_logp_difference/mean": 0.013619141653180122, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 228.640625, "completions/mean_terminated_length": 228.640625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3943815529346466, "epoch": 1.1237745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.02079319192360828, "kl": 0.03533846512436867, "learning_rate": 7.889724155056776e-07, "loss": 0.0003, "num_tokens": 29029340.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9286022186279297, "sampling/importance_sampling_ratio/mean": 1.0006740093231201, "sampling/importance_sampling_ratio/min": 0.5990237593650818, "sampling/sampling_logp_difference/max": 0.6567955017089844, "sampling/sampling_logp_difference/mean": 0.014496782794594765, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 271.28125, "completions/mean_terminated_length": 271.28125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4110572934150696, "epoch": 1.125, "frac_reward_zero_std": 1.0, "grad_norm": 0.022792274933252598, "kl": 0.027118559926748276, "learning_rate": 7.883907668817506e-07, "loss": 0.0003, "num_tokens": 29066958.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.438292384147644, "sampling/importance_sampling_ratio/mean": 0.9999451637268066, "sampling/importance_sampling_ratio/min": 0.7216097712516785, "sampling/sampling_logp_difference/max": 0.36345648765563965, "sampling/sampling_logp_difference/mean": 0.013454319909214973, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 261.46875, "completions/mean_terminated_length": 261.46875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.38979044556617737, "epoch": 1.1262254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.02604291304095247, "kl": 0.029376091435551643, "learning_rate": 7.878085328428368e-07, "loss": 0.0003, "num_tokens": 29100028.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4420621395111084, "sampling/importance_sampling_ratio/mean": 1.0001835823059082, "sampling/importance_sampling_ratio/min": 0.6249132752418518, "sampling/sampling_logp_difference/max": 0.4701423645019531, "sampling/sampling_logp_difference/mean": 0.014051743783056736, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 191.90625, "completions/mean_terminated_length": 191.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.41036784648895264, "epoch": 1.1274509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.016530115017810088, "kl": 0.02435620129108429, "learning_rate": 7.872257145708345e-07, "loss": 0.0002, "num_tokens": 29133910.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6928790807724, "sampling/importance_sampling_ratio/mean": 0.9995551705360413, "sampling/importance_sampling_ratio/min": 0.6134859919548035, "sampling/sampling_logp_difference/max": 0.5264307260513306, "sampling/sampling_logp_difference/mean": 0.0158233679831028, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 201.390625, "completions/mean_terminated_length": 201.390625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.29508695006370544, "epoch": 1.1286764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.013986273065847703, "kl": 0.018711119890213013, "learning_rate": 7.86642313248828e-07, "loss": 0.0002, "num_tokens": 29162495.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.521493673324585, "sampling/importance_sampling_ratio/mean": 0.999792754650116, "sampling/importance_sampling_ratio/min": 0.6860750913619995, "sampling/sampling_logp_difference/max": 0.4196925163269043, "sampling/sampling_logp_difference/mean": 0.0119221406057477, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 299.90625, "completions/mean_terminated_length": 299.90625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.42031940817832947, "epoch": 1.1299019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.9134031683097505, "kl": 0.031263936311006546, "learning_rate": 7.860583300610847e-07, "loss": 0.0032, "num_tokens": 29206041.0, "reward": -0.28125, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5278496742248535, "sampling/importance_sampling_ratio/mean": 1.0002261400222778, "sampling/importance_sampling_ratio/min": 0.5127838253974915, "sampling/sampling_logp_difference/max": 0.6679009199142456, "sampling/sampling_logp_difference/mean": 0.01367473229765892, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 213.765625, "completions/mean_terminated_length": 213.765625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.45548272132873535, "epoch": 1.1311274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.01808893076897117, "kl": 0.028725363314151764, "learning_rate": 7.854737661930539e-07, "loss": 0.0003, "num_tokens": 29235082.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4327467679977417, "sampling/importance_sampling_ratio/mean": 1.0003161430358887, "sampling/importance_sampling_ratio/min": 0.6732383370399475, "sampling/sampling_logp_difference/max": 0.39565587043762207, "sampling/sampling_logp_difference/mean": 0.016030866652727127, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 219.390625, "completions/mean_terminated_length": 219.390625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3964219391345978, "epoch": 1.1323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013289578435529128, "kl": 0.024509485810995102, "learning_rate": 7.848886228313632e-07, "loss": 0.0002, "num_tokens": 29268851.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3204513788223267, "sampling/importance_sampling_ratio/mean": 1.0001609325408936, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.0140391755849123, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 172.796875, "completions/mean_terminated_length": 172.796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2802438735961914, "epoch": 1.133578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.015751847974729585, "kl": 0.02570202574133873, "learning_rate": 7.843029011638162e-07, "loss": 0.0002, "num_tokens": 29293078.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4692705869674683, "sampling/importance_sampling_ratio/mean": 1.00046968460083, "sampling/importance_sampling_ratio/min": 0.6958165168762207, "sampling/sampling_logp_difference/max": 0.3847661018371582, "sampling/sampling_logp_difference/mean": 0.012318532913923264, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 197.328125, "completions/mean_terminated_length": 197.328125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3475108742713928, "epoch": 1.1348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.024025209368769927, "kl": 0.03196142613887787, "learning_rate": 7.837166023793908e-07, "loss": 0.0003, "num_tokens": 29324795.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5470669269561768, "sampling/importance_sampling_ratio/mean": 0.9998123049736023, "sampling/importance_sampling_ratio/min": 0.752821683883667, "sampling/sampling_logp_difference/max": 0.43636083602905273, "sampling/sampling_logp_difference/mean": 0.01308157853782177, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 172.578125, "completions/mean_terminated_length": 172.578125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3719062805175781, "epoch": 1.1360294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.026864161189479176, "kl": 0.03267707675695419, "learning_rate": 7.831297276682368e-07, "loss": 0.0003, "num_tokens": 29349952.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6383248567581177, "sampling/importance_sampling_ratio/mean": 0.9998093843460083, "sampling/importance_sampling_ratio/min": 0.6067643761634827, "sampling/sampling_logp_difference/max": 0.4996147155761719, "sampling/sampling_logp_difference/mean": 0.016318414360284805, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 214.140625, "completions/mean_terminated_length": 214.140625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4109461307525635, "epoch": 1.1372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.023827732349712038, "kl": 0.03133368492126465, "learning_rate": 7.825422782216724e-07, "loss": 0.0003, "num_tokens": 29384041.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3573529720306396, "sampling/importance_sampling_ratio/mean": 0.9996014833450317, "sampling/importance_sampling_ratio/min": 0.65472412109375, "sampling/sampling_logp_difference/max": 0.4235413074493408, "sampling/sampling_logp_difference/mean": 0.014543693512678146, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 200.4375, "completions/mean_terminated_length": 200.4375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4032849371433258, "epoch": 1.1384803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.015633853504003762, "kl": 0.022977590560913086, "learning_rate": 7.819542552321827e-07, "loss": 0.0002, "num_tokens": 29412405.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 1.0006990432739258, "sampling/importance_sampling_ratio/min": 0.6202813386917114, "sampling/sampling_logp_difference/max": 0.49199533462524414, "sampling/sampling_logp_difference/mean": 0.014526978135108948, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 220.9375, "completions/mean_terminated_length": 220.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4555209279060364, "epoch": 1.1397058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.018454166474342476, "kl": 0.029399922117590904, "learning_rate": 7.813656598934173e-07, "loss": 0.0003, "num_tokens": 29443825.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4754360914230347, "sampling/importance_sampling_ratio/mean": 0.9996351003646851, "sampling/importance_sampling_ratio/min": 0.6344634294509888, "sampling/sampling_logp_difference/max": 0.45497560501098633, "sampling/sampling_logp_difference/mean": 0.016958877444267273, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 249.84375, "completions/mean_terminated_length": 249.84375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.34868285059928894, "epoch": 1.1409313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.013567642960694574, "kl": 0.021005570888519287, "learning_rate": 7.807764934001874e-07, "loss": 0.0002, "num_tokens": 29476423.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6121875047683716, "sampling/importance_sampling_ratio/mean": 1.000044345855713, "sampling/importance_sampling_ratio/min": 0.6066538691520691, "sampling/sampling_logp_difference/max": 0.49979686737060547, "sampling/sampling_logp_difference/mean": 0.013358568772673607, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 216.390625, "completions/mean_terminated_length": 216.390625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4077160358428955, "epoch": 1.142156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.6618215372445466, "kl": 0.03380703181028366, "learning_rate": 7.801867569484634e-07, "loss": 0.0233, "num_tokens": 29512752.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4813718795776367, "sampling/importance_sampling_ratio/mean": 0.9998239278793335, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4679851531982422, "sampling/sampling_logp_difference/mean": 0.015345785766839981, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 227.796875, "completions/mean_terminated_length": 227.796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.42371150851249695, "epoch": 1.1433823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.016883806363186343, "kl": 0.022962680086493492, "learning_rate": 7.795964517353733e-07, "loss": 0.0002, "num_tokens": 29543059.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8718515634536743, "sampling/importance_sampling_ratio/mean": 1.0001614093780518, "sampling/importance_sampling_ratio/min": 0.7095545530319214, "sampling/sampling_logp_difference/max": 0.6269280910491943, "sampling/sampling_logp_difference/mean": 0.014873354695737362, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.38495272397994995, "epoch": 1.1446078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.020400968831353655, "kl": 0.03413142263889313, "learning_rate": 7.790055789591993e-07, "loss": 0.0003, "num_tokens": 29571067.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5023506879806519, "sampling/importance_sampling_ratio/mean": 0.9996978044509888, "sampling/importance_sampling_ratio/min": 0.664975106716156, "sampling/sampling_logp_difference/max": 0.4080057144165039, "sampling/sampling_logp_difference/mean": 0.015339599922299385, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 152.671875, "completions/mean_terminated_length": 152.671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.39188700914382935, "epoch": 1.1458333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193314784301376, "kl": 0.025547783821821213, "learning_rate": 7.784141398193753e-07, "loss": 0.0003, "num_tokens": 29605158.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4144792556762695, "sampling/importance_sampling_ratio/mean": 0.9997140765190125, "sampling/importance_sampling_ratio/min": 0.6417416334152222, "sampling/sampling_logp_difference/max": 0.4435694217681885, "sampling/sampling_logp_difference/mean": 0.015097095631062984, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.38108593225479126, "epoch": 1.1470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012087511602747795, "kl": 0.01746346801519394, "learning_rate": 7.778221355164857e-07, "loss": 0.0002, "num_tokens": 29648670.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.751657247543335, "sampling/importance_sampling_ratio/mean": 1.0000417232513428, "sampling/importance_sampling_ratio/min": 0.6792500615119934, "sampling/sampling_logp_difference/max": 0.5605623722076416, "sampling/sampling_logp_difference/mean": 0.014899947680532932, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 208.90625, "completions/mean_terminated_length": 208.90625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.37955600023269653, "epoch": 1.1482843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.01647561168103897, "kl": 0.02749289572238922, "learning_rate": 7.772295672522614e-07, "loss": 0.0003, "num_tokens": 29680936.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6303905248641968, "sampling/importance_sampling_ratio/mean": 1.0001499652862549, "sampling/importance_sampling_ratio/min": 0.677166223526001, "sampling/sampling_logp_difference/max": 0.48881959915161133, "sampling/sampling_logp_difference/mean": 0.013886782340705395, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 213.3125, "completions/mean_terminated_length": 213.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4735932946205139, "epoch": 1.1495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7218453170992659, "kl": 0.028180256485939026, "learning_rate": 7.766364362295788e-07, "loss": -0.013, "num_tokens": 29713772.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.3298238515853882, "sampling/importance_sampling_ratio/mean": 0.9997259974479675, "sampling/importance_sampling_ratio/min": 0.6942225694656372, "sampling/sampling_logp_difference/max": 0.3649625778198242, "sampling/sampling_logp_difference/mean": 0.0170447938144207, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 225.078125, "completions/mean_terminated_length": 225.078125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.40240204334259033, "epoch": 1.150735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01867880778206093, "kl": 0.024466482922434807, "learning_rate": 7.760427436524559e-07, "loss": 0.0002, "num_tokens": 29746673.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5001312494277954, "sampling/importance_sampling_ratio/mean": 0.9998698830604553, "sampling/importance_sampling_ratio/min": 0.6142581701278687, "sampling/sampling_logp_difference/max": 0.48733997344970703, "sampling/sampling_logp_difference/mean": 0.014945314265787601, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 215.734375, "completions/mean_terminated_length": 215.734375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4451993405818939, "epoch": 1.1519607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.01680554671806869, "kl": 0.027862906455993652, "learning_rate": 7.754484907260512e-07, "loss": 0.0003, "num_tokens": 29778192.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4360154867172241, "sampling/importance_sampling_ratio/mean": 1.0001566410064697, "sampling/importance_sampling_ratio/min": 0.6816403865814209, "sampling/sampling_logp_difference/max": 0.3832530975341797, "sampling/sampling_logp_difference/mean": 0.016417954117059708, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 256.078125, "completions/mean_terminated_length": 256.078125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3542017340660095, "epoch": 1.153186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.7621131140916966, "kl": 0.021910440176725388, "learning_rate": 7.748536786566606e-07, "loss": 0.0041, "num_tokens": 29813653.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5361313819885254, "sampling/importance_sampling_ratio/mean": 0.9997000098228455, "sampling/importance_sampling_ratio/min": 0.6090093851089478, "sampling/sampling_logp_difference/max": 0.49592161178588867, "sampling/sampling_logp_difference/mean": 0.012756800279021263, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 207.140625, "completions/mean_terminated_length": 207.140625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3920484185218811, "epoch": 1.1544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.020545587392545597, "kl": 0.025344235822558403, "learning_rate": 7.742583086517149e-07, "loss": 0.0003, "num_tokens": 29849198.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4313565492630005, "sampling/importance_sampling_ratio/mean": 1.00014328956604, "sampling/importance_sampling_ratio/min": 0.6174163818359375, "sampling/sampling_logp_difference/max": 0.4822115898132324, "sampling/sampling_logp_difference/mean": 0.014016522094607353, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 225.46875, "completions/mean_terminated_length": 225.46875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.32754069566726685, "epoch": 1.155637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.017125159976420713, "kl": 0.019937563687562943, "learning_rate": 7.736623819197773e-07, "loss": 0.0002, "num_tokens": 29881660.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4777964353561401, "sampling/importance_sampling_ratio/mean": 1.0002424716949463, "sampling/importance_sampling_ratio/min": 0.6300315856933594, "sampling/sampling_logp_difference/max": 0.46198534965515137, "sampling/sampling_logp_difference/mean": 0.013355264440178871, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 197.59375, "completions/mean_terminated_length": 197.59375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.27188652753829956, "epoch": 1.156862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.01673616429022431, "kl": 0.01753295212984085, "learning_rate": 7.730658996705415e-07, "loss": 0.0002, "num_tokens": 29913826.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.491621732711792, "sampling/importance_sampling_ratio/mean": 1.0000965595245361, "sampling/importance_sampling_ratio/min": 0.6122355461120605, "sampling/sampling_logp_difference/max": 0.49063825607299805, "sampling/sampling_logp_difference/mean": 0.012826650403439999, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 170.203125, "completions/mean_terminated_length": 170.203125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.39124971628189087, "epoch": 1.1580882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.9643865686120452, "kl": 0.03476168215274811, "learning_rate": 7.724688631148286e-07, "loss": -0.0043, "num_tokens": 29942831.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4753769636154175, "sampling/importance_sampling_ratio/mean": 0.999715268611908, "sampling/importance_sampling_ratio/min": 0.6182939410209656, "sampling/sampling_logp_difference/max": 0.4807913303375244, "sampling/sampling_logp_difference/mean": 0.014948323369026184, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 229.96875, "completions/mean_terminated_length": 229.96875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35787203907966614, "epoch": 1.159313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.015282129492059498, "kl": 0.021442048251628876, "learning_rate": 7.718712734645849e-07, "loss": 0.0002, "num_tokens": 29975725.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.632232904434204, "sampling/importance_sampling_ratio/mean": 0.999840259552002, "sampling/importance_sampling_ratio/min": 0.5060733556747437, "sampling/sampling_logp_difference/max": 0.6810736656188965, "sampling/sampling_logp_difference/mean": 0.013961701653897762, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.36832353472709656, "epoch": 1.1605392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.023454253214559635, "kl": 0.03226148337125778, "learning_rate": 7.712731319328797e-07, "loss": 0.0003, "num_tokens": 30005557.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3227823972702026, "sampling/importance_sampling_ratio/mean": 1.000182867050171, "sampling/importance_sampling_ratio/min": 0.6053187847137451, "sampling/sampling_logp_difference/max": 0.502000093460083, "sampling/sampling_logp_difference/mean": 0.014252797700464725, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 164.46875, "completions/mean_terminated_length": 164.46875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3432360589504242, "epoch": 1.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0163520986602648, "kl": 0.021451342850923538, "learning_rate": 7.706744397339022e-07, "loss": 0.0002, "num_tokens": 30031971.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4416695833206177, "sampling/importance_sampling_ratio/mean": 0.9997134804725647, "sampling/importance_sampling_ratio/min": 0.6160857677459717, "sampling/sampling_logp_difference/max": 0.48436903953552246, "sampling/sampling_logp_difference/mean": 0.015156615525484085, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 232.546875, "completions/mean_terminated_length": 232.546875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.44087710976600647, "epoch": 1.1629901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.9197137471296929, "kl": 0.026438506320118904, "learning_rate": 7.700751980829601e-07, "loss": 0.0183, "num_tokens": 30065638.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.461745262145996, "sampling/importance_sampling_ratio/mean": 0.9997392892837524, "sampling/importance_sampling_ratio/min": 0.6299211382865906, "sampling/sampling_logp_difference/max": 0.462160587310791, "sampling/sampling_logp_difference/mean": 0.01626892387866974, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 171.578125, "completions/mean_terminated_length": 171.578125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3260308504104614, "epoch": 1.1642156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 0.8716004942013297, "kl": 0.023889085277915, "learning_rate": 7.694754081964754e-07, "loss": -0.0032, "num_tokens": 30092187.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4729878902435303, "sampling/importance_sampling_ratio/mean": 1.000300407409668, "sampling/importance_sampling_ratio/min": 0.7207422852516174, "sampling/sampling_logp_difference/max": 0.38729286193847656, "sampling/sampling_logp_difference/mean": 0.013107338920235634, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 242.734375, "completions/mean_terminated_length": 242.734375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.34090811014175415, "epoch": 1.1654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01171329826460024, "kl": 0.019636783748865128, "learning_rate": 7.688750712919839e-07, "loss": 0.0002, "num_tokens": 30129146.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4735112190246582, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.5814192891120911, "sampling/sampling_logp_difference/max": 0.5422830581665039, "sampling/sampling_logp_difference/mean": 0.012322664260864258, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 240.953125, "completions/mean_terminated_length": 240.953125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4532848298549652, "epoch": 1.1666666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.5620988096409886, "kl": 0.028768297284841537, "learning_rate": 7.682741885881314e-07, "loss": 0.0026, "num_tokens": 30162903.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4075736999511719, "sampling/importance_sampling_ratio/mean": 1.0002433061599731, "sampling/importance_sampling_ratio/min": 0.7008883357048035, "sampling/sampling_logp_difference/max": 0.35540664196014404, "sampling/sampling_logp_difference/mean": 0.015215197578072548, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 207.46875, "completions/mean_terminated_length": 207.46875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4852984547615051, "epoch": 1.1678921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.01658368194230398, "kl": 0.024968475103378296, "learning_rate": 7.676727613046719e-07, "loss": 0.0003, "num_tokens": 30197557.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.39171302318573, "sampling/importance_sampling_ratio/mean": 1.0001628398895264, "sampling/importance_sampling_ratio/min": 0.6412505507469177, "sampling/sampling_logp_difference/max": 0.4443349838256836, "sampling/sampling_logp_difference/mean": 0.016897138208150864, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 191.328125, "completions/mean_terminated_length": 191.328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.42920494079589844, "epoch": 1.1691176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.8958474022836896, "kl": 0.052344582974910736, "learning_rate": 7.670707906624643e-07, "loss": -0.0074, "num_tokens": 30224538.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.574435830116272, "sampling/importance_sampling_ratio/mean": 0.9998064637184143, "sampling/importance_sampling_ratio/min": 0.4768851101398468, "sampling/sampling_logp_difference/max": 0.7404797077178955, "sampling/sampling_logp_difference/mean": 0.016219427809119225, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 198.796875, "completions/mean_terminated_length": 198.796875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3773866891860962, "epoch": 1.170343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.02937723992809643, "kl": 0.029355604201555252, "learning_rate": 7.664682778834712e-07, "loss": 0.0003, "num_tokens": 30254845.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4508373737335205, "sampling/importance_sampling_ratio/mean": 0.999887228012085, "sampling/importance_sampling_ratio/min": 0.6804096102714539, "sampling/sampling_logp_difference/max": 0.38506031036376953, "sampling/sampling_logp_difference/mean": 0.015669919550418854, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.45939186215400696, "epoch": 1.1715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.023625100950576464, "kl": 0.029001720249652863, "learning_rate": 7.658652241907554e-07, "loss": 0.0003, "num_tokens": 30282477.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6038897037506104, "sampling/importance_sampling_ratio/mean": 1.0003774166107178, "sampling/importance_sampling_ratio/min": 0.6473898887634277, "sampling/sampling_logp_difference/max": 0.47243165969848633, "sampling/sampling_logp_difference/mean": 0.01826980710029602, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 198.71875, "completions/mean_terminated_length": 198.71875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3485085666179657, "epoch": 1.1727941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.021714340953082334, "kl": 0.02960689179599285, "learning_rate": 7.652616308084774e-07, "loss": 0.0003, "num_tokens": 30314427.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.429936170578003, "sampling/importance_sampling_ratio/mean": 0.9995099306106567, "sampling/importance_sampling_ratio/min": 0.6600603461265564, "sampling/sampling_logp_difference/max": 0.415424108505249, "sampling/sampling_logp_difference/mean": 0.012969018891453743, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 200.59375, "completions/mean_terminated_length": 200.59375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3420395255088806, "epoch": 1.1740196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.6920962449991449, "kl": 0.03241267055273056, "learning_rate": 7.646574989618937e-07, "loss": -0.001, "num_tokens": 30343281.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3937641382217407, "sampling/importance_sampling_ratio/mean": 0.9999229907989502, "sampling/importance_sampling_ratio/min": 0.6786736845970154, "sampling/sampling_logp_difference/max": 0.3876148462295532, "sampling/sampling_logp_difference/mean": 0.014236288145184517, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 171.96875, "completions/mean_terminated_length": 171.96875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4044609069824219, "epoch": 1.1752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02605573369834518, "kl": 0.034083012491464615, "learning_rate": 7.640528298773536e-07, "loss": 0.0003, "num_tokens": 30370479.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.402698278427124, "sampling/importance_sampling_ratio/mean": 1.0000107288360596, "sampling/importance_sampling_ratio/min": 0.671543300151825, "sampling/sampling_logp_difference/max": 0.3981768488883972, "sampling/sampling_logp_difference/mean": 0.016809877008199692, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 179.890625, "completions/mean_terminated_length": 179.890625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.4167441725730896, "epoch": 1.1764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.019579150616267162, "kl": 0.03047661855816841, "learning_rate": 7.634476247822972e-07, "loss": 0.0003, "num_tokens": 30398056.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6280049085617065, "sampling/importance_sampling_ratio/mean": 0.9998691082000732, "sampling/importance_sampling_ratio/min": 0.6427965760231018, "sampling/sampling_logp_difference/max": 0.48735523223876953, "sampling/sampling_logp_difference/mean": 0.01612289622426033, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 196.515625, "completions/mean_terminated_length": 196.515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.35048040747642517, "epoch": 1.1776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.677471277336816, "kl": 0.02648363634943962, "learning_rate": 7.628418849052523e-07, "loss": -0.0071, "num_tokens": 30426217.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6926275491714478, "sampling/importance_sampling_ratio/mean": 1.0004546642303467, "sampling/importance_sampling_ratio/min": 0.6103792786598206, "sampling/sampling_logp_difference/max": 0.5262820720672607, "sampling/sampling_logp_difference/mean": 0.013000758364796638, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 239.15625, "completions/mean_terminated_length": 239.15625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.36057567596435547, "epoch": 1.178921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.020773236537636602, "kl": 0.02622954733669758, "learning_rate": 7.622356114758327e-07, "loss": 0.0003, "num_tokens": 30458483.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5686147212982178, "sampling/importance_sampling_ratio/mean": 0.9998708963394165, "sampling/importance_sampling_ratio/min": 0.6077016592025757, "sampling/sampling_logp_difference/max": 0.49807119369506836, "sampling/sampling_logp_difference/mean": 0.013564372435212135, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 175.8125, "completions/mean_terminated_length": 175.8125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3835163116455078, "epoch": 1.1801470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020059666231379888, "kl": 0.031151343137025833, "learning_rate": 7.616288057247349e-07, "loss": 0.0003, "num_tokens": 30488839.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5970481634140015, "sampling/importance_sampling_ratio/mean": 1.0011179447174072, "sampling/importance_sampling_ratio/min": 0.6336846351623535, "sampling/sampling_logp_difference/max": 0.4681570529937744, "sampling/sampling_logp_difference/mean": 0.01436849869787693, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 191.5625, "completions/mean_terminated_length": 191.5625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.338411420583725, "epoch": 1.1813725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.014550082356668844, "kl": 0.020898407325148582, "learning_rate": 7.610214688837361e-07, "loss": 0.0002, "num_tokens": 30527851.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4076093435287476, "sampling/importance_sampling_ratio/mean": 0.999811589717865, "sampling/importance_sampling_ratio/min": 0.5281767845153809, "sampling/sampling_logp_difference/max": 0.6383242607116699, "sampling/sampling_logp_difference/mean": 0.012341796420514584, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 659.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 171.359375, "completions/mean_terminated_length": 171.359375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.31215670704841614, "epoch": 1.1825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.018372978036901275, "kl": 0.024941347539424896, "learning_rate": 7.604136021856916e-07, "loss": 0.0002, "num_tokens": 30555282.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071580410003662, "sampling/importance_sampling_ratio/mean": 1.0002996921539307, "sampling/importance_sampling_ratio/min": 0.6637460589408875, "sampling/sampling_logp_difference/max": 0.4102257490158081, "sampling/sampling_logp_difference/mean": 0.013817012310028076, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 176.75, "completions/mean_terminated_length": 176.75, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3429717719554901, "epoch": 1.1838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017101053742247727, "kl": 0.022458365187048912, "learning_rate": 7.598052068645324e-07, "loss": 0.0002, "num_tokens": 30588722.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4652889966964722, "sampling/importance_sampling_ratio/mean": 1.0002515316009521, "sampling/importance_sampling_ratio/min": 0.6105897426605225, "sampling/sampling_logp_difference/max": 0.4933300018310547, "sampling/sampling_logp_difference/mean": 0.0130779342725873, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.32716017961502075, "epoch": 1.1850490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.9011340308409649, "kl": 0.02837306074798107, "learning_rate": 7.591962841552626e-07, "loss": -0.023, "num_tokens": 30625202.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4999078512191772, "sampling/importance_sampling_ratio/mean": 0.9999231696128845, "sampling/importance_sampling_ratio/min": 0.6259011626243591, "sampling/sampling_logp_difference/max": 0.4685628414154053, "sampling/sampling_logp_difference/mean": 0.013338066637516022, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 217.34375, "completions/mean_terminated_length": 217.34375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.39989498257637024, "epoch": 1.1862745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7684875311643962, "kl": 0.04626480117440224, "learning_rate": 7.585868352939562e-07, "loss": 0.0353, "num_tokens": 30655992.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5753024816513062, "sampling/importance_sampling_ratio/mean": 1.0001583099365234, "sampling/importance_sampling_ratio/min": 0.7037184238433838, "sampling/sampling_logp_difference/max": 0.45444726943969727, "sampling/sampling_logp_difference/mean": 0.015304194763302803, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4833433926105499, "epoch": 1.1875, "frac_reward_zero_std": 0.75, "grad_norm": 0.8896629453698057, "kl": 0.03638318181037903, "learning_rate": 7.579768615177564e-07, "loss": -0.0383, "num_tokens": 30686696.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3544316291809082, "sampling/importance_sampling_ratio/mean": 0.9995909929275513, "sampling/importance_sampling_ratio/min": 0.6309517025947571, "sampling/sampling_logp_difference/max": 0.4605259895324707, "sampling/sampling_logp_difference/mean": 0.01705067604780197, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.42330271005630493, "epoch": 1.1887254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.6170632654853302, "kl": 0.03930829092860222, "learning_rate": 7.57366364064871e-07, "loss": -0.0055, "num_tokens": 30726616.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5312823057174683, "sampling/importance_sampling_ratio/mean": 0.9995028972625732, "sampling/importance_sampling_ratio/min": 0.6076141595840454, "sampling/sampling_logp_difference/max": 0.4982151985168457, "sampling/sampling_logp_difference/mean": 0.014564193785190582, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 248.40625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.39436405897140503, "epoch": 1.1899509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.0215890274466495, "kl": 0.03378557786345482, "learning_rate": 7.567553441745711e-07, "loss": -0.0112, "num_tokens": 30765666.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6150052547454834, "sampling/importance_sampling_ratio/mean": 0.9998675584793091, "sampling/importance_sampling_ratio/min": 0.6254509687423706, "sampling/sampling_logp_difference/max": 0.4793381690979004, "sampling/sampling_logp_difference/mean": 0.01351084467023611, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4210416078567505, "epoch": 1.1911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.020457549262549238, "kl": 0.027543930336833, "learning_rate": 7.561438030871885e-07, "loss": 0.0003, "num_tokens": 30797522.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5265755653381348, "sampling/importance_sampling_ratio/mean": 1.0002741813659668, "sampling/importance_sampling_ratio/min": 0.5889442563056946, "sampling/sampling_logp_difference/max": 0.529423713684082, "sampling/sampling_logp_difference/mean": 0.01547271478921175, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.36282581090927124, "epoch": 1.1924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.8702373210575008, "kl": 0.03914202004671097, "learning_rate": 7.555317420441129e-07, "loss": -0.0493, "num_tokens": 30836442.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5971460342407227, "sampling/importance_sampling_ratio/mean": 1.0002176761627197, "sampling/importance_sampling_ratio/min": 0.6056472063064575, "sampling/sampling_logp_difference/max": 0.501457691192627, "sampling/sampling_logp_difference/mean": 0.011751336045563221, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 255.578125, "completions/mean_terminated_length": 255.578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3856160342693329, "epoch": 1.1936274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.026546802403762147, "kl": 0.04335259646177292, "learning_rate": 7.549191622877892e-07, "loss": 0.0005, "num_tokens": 30871407.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5312490463256836, "sampling/importance_sampling_ratio/mean": 1.0002844333648682, "sampling/importance_sampling_ratio/min": 0.6207950115203857, "sampling/sampling_logp_difference/max": 0.47675442695617676, "sampling/sampling_logp_difference/mean": 0.013014108873903751, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 264.703125, "completions/mean_terminated_length": 264.703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4230411648750305, "epoch": 1.1948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.5991911776847293, "kl": 0.026629038155078888, "learning_rate": 7.543060650617158e-07, "loss": 0.0116, "num_tokens": 30906860.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5496594905853271, "sampling/importance_sampling_ratio/mean": 0.9999898076057434, "sampling/importance_sampling_ratio/min": 0.6419976949691772, "sampling/sampling_logp_difference/max": 0.44317054748535156, "sampling/sampling_logp_difference/mean": 0.013829650357365608, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 263.140625, "completions/mean_terminated_length": 263.140625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4759698212146759, "epoch": 1.196078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 0.8951274530401899, "kl": 0.04232658073306084, "learning_rate": 7.53692451610441e-07, "loss": 0.0034, "num_tokens": 30943461.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.505446195602417, "sampling/importance_sampling_ratio/mean": 1.0001161098480225, "sampling/importance_sampling_ratio/min": 0.6627125144004822, "sampling/sampling_logp_difference/max": 0.4114140272140503, "sampling/sampling_logp_difference/mean": 0.014992992393672466, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 290.609375, "completions/mean_terminated_length": 290.609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4588989019393921, "epoch": 1.1973039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 0.9283088244233804, "kl": 0.03836783021688461, "learning_rate": 7.530783231795614e-07, "loss": 0.0558, "num_tokens": 30979052.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3670469522476196, "sampling/importance_sampling_ratio/mean": 0.9999618530273438, "sampling/importance_sampling_ratio/min": 0.6349077224731445, "sampling/sampling_logp_difference/max": 0.45427560806274414, "sampling/sampling_logp_difference/mean": 0.01504638884216547, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 239.1875, "completions/mean_terminated_length": 239.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.26992595195770264, "epoch": 1.1985294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.7095312256515067, "kl": 0.022371649742126465, "learning_rate": 7.524636810157188e-07, "loss": -0.0168, "num_tokens": 31012632.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5478365421295166, "sampling/importance_sampling_ratio/mean": 1.0002949237823486, "sampling/importance_sampling_ratio/min": 0.6262792944908142, "sampling/sampling_logp_difference/max": 0.467958927154541, "sampling/sampling_logp_difference/mean": 0.01214311271905899, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 187.21875, "completions/mean_terminated_length": 187.21875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3832342028617859, "epoch": 1.1997549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02012604263715308, "kl": 0.036140792071819305, "learning_rate": 7.518485263665977e-07, "loss": 0.0004, "num_tokens": 31041558.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.646314263343811, "sampling/importance_sampling_ratio/mean": 1.0002691745758057, "sampling/importance_sampling_ratio/min": 0.6513071656227112, "sampling/sampling_logp_difference/max": 0.4985389709472656, "sampling/sampling_logp_difference/mean": 0.014832520857453346, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 201.6875, "completions/mean_terminated_length": 201.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.5023150444030762, "epoch": 1.2009803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.1706872148884344, "kl": 0.050048939883708954, "learning_rate": 7.512328604809232e-07, "loss": 0.0054, "num_tokens": 31069074.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.588779330253601, "sampling/importance_sampling_ratio/mean": 0.9997115731239319, "sampling/importance_sampling_ratio/min": 0.7012655138969421, "sampling/sampling_logp_difference/max": 0.4629659652709961, "sampling/sampling_logp_difference/mean": 0.017660582438111305, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 270.71875, "completions/mean_terminated_length": 270.71875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.40667831897735596, "epoch": 1.2022058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.6439952382837856, "kl": 0.027503911405801773, "learning_rate": 7.506166846084579e-07, "loss": 0.0027, "num_tokens": 31104624.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.450819730758667, "sampling/importance_sampling_ratio/mean": 0.999941349029541, "sampling/importance_sampling_ratio/min": 0.6276686787605286, "sampling/sampling_logp_difference/max": 0.465742826461792, "sampling/sampling_logp_difference/mean": 0.014252717606723309, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 256.140625, "completions/mean_terminated_length": 256.140625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.41683530807495117, "epoch": 1.2034313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.711185580258691, "kl": 0.03274468705058098, "learning_rate": 7.5e-07, "loss": 0.0331, "num_tokens": 31150649.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996276497840881, "sampling/importance_sampling_ratio/min": 0.6253455281257629, "sampling/sampling_logp_difference/max": 0.854128360748291, "sampling/sampling_logp_difference/mean": 0.013821342028677464, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 175.765625, "completions/mean_terminated_length": 175.765625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4364689588546753, "epoch": 1.204656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.8481335327568599, "kl": 0.04834900051355362, "learning_rate": 7.493828079073801e-07, "loss": -0.0047, "num_tokens": 31175114.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3553777933120728, "sampling/importance_sampling_ratio/mean": 0.999114453792572, "sampling/importance_sampling_ratio/min": 0.6089012026786804, "sampling/sampling_logp_difference/max": 0.49609923362731934, "sampling/sampling_logp_difference/mean": 0.01572294905781746, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 148.953125, "completions/mean_terminated_length": 148.953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3382428288459778, "epoch": 1.2058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02191739361832906, "kl": 0.033510662615299225, "learning_rate": 7.487651095834588e-07, "loss": 0.0003, "num_tokens": 31199111.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.318970799446106, "sampling/importance_sampling_ratio/mean": 1.0000004768371582, "sampling/importance_sampling_ratio/min": 0.7020292282104492, "sampling/sampling_logp_difference/max": 0.35378026962280273, "sampling/sampling_logp_difference/mean": 0.012856241315603256, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 248.484375, "completions/mean_terminated_length": 248.484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4134531617164612, "epoch": 1.2071078431372548, "frac_reward_zero_std": 0.5, "grad_norm": 0.9921352611677013, "kl": 0.03773762285709381, "learning_rate": 7.481469062821251e-07, "loss": 0.018, "num_tokens": 31232518.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.593570590019226, "sampling/importance_sampling_ratio/mean": 1.0001540184020996, "sampling/importance_sampling_ratio/min": 0.6361991763114929, "sampling/sampling_logp_difference/max": 0.46597719192504883, "sampling/sampling_logp_difference/mean": 0.014097131788730621, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 238.328125, "completions/mean_terminated_length": 238.328125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4444141387939453, "epoch": 1.2083333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.7653636702913152, "kl": 0.03863909840583801, "learning_rate": 7.47528199258292e-07, "loss": -0.0156, "num_tokens": 31266283.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.300205945968628, "sampling/importance_sampling_ratio/mean": 0.999968409538269, "sampling/importance_sampling_ratio/min": 0.6790764331817627, "sampling/sampling_logp_difference/max": 0.387021541595459, "sampling/sampling_logp_difference/mean": 0.014662148430943489, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 180.59375, "completions/mean_terminated_length": 180.59375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3819793164730072, "epoch": 1.2095588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0203682283357998, "kl": 0.03312782198190689, "learning_rate": 7.469089897678957e-07, "loss": 0.0003, "num_tokens": 31291073.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 0.9995713829994202, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.014812866225838661, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 197.6875, "completions/mean_terminated_length": 197.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.5025283694267273, "epoch": 1.2107843137254901, "frac_reward_zero_std": 0.5, "grad_norm": 1.1871480858972723, "kl": 0.08525514602661133, "learning_rate": 7.462892790678925e-07, "loss": 0.023, "num_tokens": 31321357.0, "reward": 0.0625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5077842473983765, "sampling/importance_sampling_ratio/mean": 0.9996609687805176, "sampling/importance_sampling_ratio/min": 0.6306982636451721, "sampling/sampling_logp_difference/max": 0.46092772483825684, "sampling/sampling_logp_difference/mean": 0.015542639419436455, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 211.0625, "completions/mean_terminated_length": 211.0625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.5481150150299072, "epoch": 1.2120098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.1934147915780475, "kl": 0.043784111738204956, "learning_rate": 7.456690684162556e-07, "loss": 0.0195, "num_tokens": 31349329.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.484890341758728, "sampling/importance_sampling_ratio/mean": 1.0009729862213135, "sampling/importance_sampling_ratio/min": 0.6217290163040161, "sampling/sampling_logp_difference/max": 0.4752509593963623, "sampling/sampling_logp_difference/mean": 0.018647316843271255, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 193.734375, "completions/mean_terminated_length": 193.734375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.5381791591644287, "epoch": 1.213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.1130555604937038, "kl": 0.052825599908828735, "learning_rate": 7.450483590719736e-07, "loss": -0.0192, "num_tokens": 31390544.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4463319778442383, "sampling/importance_sampling_ratio/mean": 1.0000205039978027, "sampling/importance_sampling_ratio/min": 0.6269499063491821, "sampling/sampling_logp_difference/max": 0.4668886661529541, "sampling/sampling_logp_difference/mean": 0.017740219831466675, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 196.671875, "completions/mean_terminated_length": 196.671875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.5464645624160767, "epoch": 1.2144607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109620294982689, "kl": 0.04515843093395233, "learning_rate": 7.444271522950468e-07, "loss": 0.0005, "num_tokens": 31419883.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6023401021957397, "sampling/importance_sampling_ratio/mean": 1.0001468658447266, "sampling/importance_sampling_ratio/min": 0.662236213684082, "sampling/sampling_logp_difference/max": 0.4714651107788086, "sampling/sampling_logp_difference/mean": 0.016527537256479263, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.41974663734436035, "epoch": 1.215686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.023489431271263374, "kl": 0.033342257142066956, "learning_rate": 7.438054493464859e-07, "loss": 0.0003, "num_tokens": 31452619.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.549433708190918, "sampling/importance_sampling_ratio/mean": 1.000077724456787, "sampling/importance_sampling_ratio/min": 0.7335385084152222, "sampling/sampling_logp_difference/max": 0.43788957595825195, "sampling/sampling_logp_difference/mean": 0.01427594292908907, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 199.6875, "completions/mean_terminated_length": 199.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4310314655303955, "epoch": 1.2169117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.017950679150781197, "kl": 0.037778109312057495, "learning_rate": 7.431832514883081e-07, "loss": 0.0004, "num_tokens": 31481591.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3894187211990356, "sampling/importance_sampling_ratio/mean": 1.0000156164169312, "sampling/importance_sampling_ratio/min": 0.6943917870521545, "sampling/sampling_logp_difference/max": 0.3647189140319824, "sampling/sampling_logp_difference/mean": 0.014455149881541729, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 180.515625, "completions/mean_terminated_length": 180.515625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4299527406692505, "epoch": 1.218137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.8697049910147577, "kl": 0.03601614385843277, "learning_rate": 7.42560559983536e-07, "loss": 0.0272, "num_tokens": 31511480.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6359535455703735, "sampling/importance_sampling_ratio/mean": 1.0001451969146729, "sampling/importance_sampling_ratio/min": 0.33314183354377747, "sampling/sampling_logp_difference/max": 1.099186897277832, "sampling/sampling_logp_difference/mean": 0.014509855769574642, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 215.390625, "completions/mean_terminated_length": 215.390625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.5677188634872437, "epoch": 1.219362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.02080626355372966, "kl": 0.03738052397966385, "learning_rate": 7.419373760961939e-07, "loss": 0.0004, "num_tokens": 31545489.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.425569772720337, "sampling/importance_sampling_ratio/mean": 0.9999176263809204, "sampling/importance_sampling_ratio/min": 0.6939416527748108, "sampling/sampling_logp_difference/max": 0.36536741256713867, "sampling/sampling_logp_difference/mean": 0.017984483391046524, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 182.09375, "completions/mean_terminated_length": 182.09375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.44296029210090637, "epoch": 1.2205882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.8011630456171402, "kl": 0.04674481600522995, "learning_rate": 7.413137010913054e-07, "loss": 0.0142, "num_tokens": 31572871.0, "reward": -0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.8862242698669434, "sampling/importance_sampling_ratio/mean": 1.0002238750457764, "sampling/importance_sampling_ratio/min": 0.6639931797981262, "sampling/sampling_logp_difference/max": 0.6345770359039307, "sampling/sampling_logp_difference/mean": 0.015609879046678543, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 213.65625, "completions/mean_terminated_length": 213.65625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5575160384178162, "epoch": 1.221813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.9100488001028793, "kl": 0.051423899829387665, "learning_rate": 7.406895362348915e-07, "loss": -0.0139, "num_tokens": 31609169.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3358298540115356, "sampling/importance_sampling_ratio/mean": 0.9997066855430603, "sampling/importance_sampling_ratio/min": 0.6651027202606201, "sampling/sampling_logp_difference/max": 0.40781378746032715, "sampling/sampling_logp_difference/mean": 0.016100607812404633, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 200.9375, "completions/mean_terminated_length": 200.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4968104660511017, "epoch": 1.2230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.9513883949425093, "kl": 0.041767850518226624, "learning_rate": 7.400648827939671e-07, "loss": 0.0249, "num_tokens": 31640381.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5180104970932007, "sampling/importance_sampling_ratio/mean": 0.9998517036437988, "sampling/importance_sampling_ratio/min": 0.6954967379570007, "sampling/sampling_logp_difference/max": 0.417400598526001, "sampling/sampling_logp_difference/mean": 0.01566733419895172, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.5883033871650696, "epoch": 1.224264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7851977292537808, "kl": 0.04143001139163971, "learning_rate": 7.394397420365392e-07, "loss": -0.0112, "num_tokens": 31672597.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.619869589805603, "sampling/importance_sampling_ratio/mean": 1.0006663799285889, "sampling/importance_sampling_ratio/min": 0.7099029421806335, "sampling/sampling_logp_difference/max": 0.4823455810546875, "sampling/sampling_logp_difference/mean": 0.017243143171072006, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 202.171875, "completions/mean_terminated_length": 202.171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3977816104888916, "epoch": 1.2254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8975597494126939, "kl": 0.03055647574365139, "learning_rate": 7.388141152316038e-07, "loss": -0.0348, "num_tokens": 31701616.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4754674434661865, "sampling/importance_sampling_ratio/mean": 1.0006155967712402, "sampling/importance_sampling_ratio/min": 0.6483342051506042, "sampling/sampling_logp_difference/max": 0.4333488941192627, "sampling/sampling_logp_difference/mean": 0.013678541406989098, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 166.359375, "completions/mean_terminated_length": 166.359375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.39177465438842773, "epoch": 1.2267156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.022641037487017263, "kl": 0.039845582097768784, "learning_rate": 7.381880036491439e-07, "loss": 0.0004, "num_tokens": 31725047.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6429468393325806, "sampling/importance_sampling_ratio/mean": 1.0002089738845825, "sampling/importance_sampling_ratio/min": 0.7211967706680298, "sampling/sampling_logp_difference/max": 0.4964914321899414, "sampling/sampling_logp_difference/mean": 0.015742601826786995, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 229.71875, "completions/mean_terminated_length": 229.71875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4799444377422333, "epoch": 1.2279411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.021503236883838844, "kl": 0.03737611323595047, "learning_rate": 7.375614085601264e-07, "loss": 0.0004, "num_tokens": 31760325.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.653969407081604, "sampling/importance_sampling_ratio/mean": 0.9996362924575806, "sampling/importance_sampling_ratio/min": 0.6833867430686951, "sampling/sampling_logp_difference/max": 0.5031781196594238, "sampling/sampling_logp_difference/mean": 0.016141796484589577, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.464060515165329, "epoch": 1.2291666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.84347200364607, "kl": 0.06338971108198166, "learning_rate": 7.369343312364993e-07, "loss": 0.0107, "num_tokens": 31787125.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.615753173828125, "sampling/importance_sampling_ratio/mean": 1.0008978843688965, "sampling/importance_sampling_ratio/min": 0.6864622235298157, "sampling/sampling_logp_difference/max": 0.4798011779785156, "sampling/sampling_logp_difference/mean": 0.015821874141693115, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 248.40625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.48409995436668396, "epoch": 1.2303921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.7596962203726745, "kl": 0.03787213936448097, "learning_rate": 7.363067729511901e-07, "loss": -0.0062, "num_tokens": 31823455.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4660547971725464, "sampling/importance_sampling_ratio/mean": 0.9998261332511902, "sampling/importance_sampling_ratio/min": 0.6262744665145874, "sampling/sampling_logp_difference/max": 0.46796655654907227, "sampling/sampling_logp_difference/mean": 0.014787048101425171, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 239.28125, "completions/mean_terminated_length": 239.28125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4689325988292694, "epoch": 1.2316176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.01842944255374365, "kl": 0.031861260533332825, "learning_rate": 7.356787349781022e-07, "loss": 0.0003, "num_tokens": 31859329.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.603270173072815, "sampling/importance_sampling_ratio/mean": 1.00017249584198, "sampling/importance_sampling_ratio/min": 0.5374639630317688, "sampling/sampling_logp_difference/max": 0.6208934783935547, "sampling/sampling_logp_difference/mean": 0.016252703964710236, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 252.609375, "completions/mean_terminated_length": 252.609375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.5425338745117188, "epoch": 1.232843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.7354271231791148, "kl": 0.04957719147205353, "learning_rate": 7.350502185921131e-07, "loss": -0.0097, "num_tokens": 31893688.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4298754930496216, "sampling/importance_sampling_ratio/mean": 1.000070333480835, "sampling/importance_sampling_ratio/min": 0.6960698962211609, "sampling/sampling_logp_difference/max": 0.3623051643371582, "sampling/sampling_logp_difference/mean": 0.016065813601017, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 218.328125, "completions/mean_terminated_length": 218.328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.38892823457717896, "epoch": 1.2340686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.018745895758316127, "kl": 0.029336608946323395, "learning_rate": 7.344212250690711e-07, "loss": 0.0003, "num_tokens": 31921821.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.390149474143982, "sampling/importance_sampling_ratio/mean": 0.9999837875366211, "sampling/importance_sampling_ratio/min": 0.709245502948761, "sampling/sampling_logp_difference/max": 0.3435535430908203, "sampling/sampling_logp_difference/mean": 0.013726888224482536, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 227.421875, "completions/mean_terminated_length": 227.421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4669730067253113, "epoch": 1.2352941176470589, "frac_reward_zero_std": 0.5, "grad_norm": 1.0752558590695398, "kl": 0.04439125582575798, "learning_rate": 7.337917556857934e-07, "loss": 0.023, "num_tokens": 31957064.0, "reward": 0.6875, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.552883505821228, "sampling/importance_sampling_ratio/mean": 0.999682605266571, "sampling/importance_sampling_ratio/min": 0.6176310777664185, "sampling/sampling_logp_difference/max": 0.48186397552490234, "sampling/sampling_logp_difference/mean": 0.015025627799332142, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.5444422364234924, "epoch": 1.2365196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.7670486189730752, "kl": 0.06080947816371918, "learning_rate": 7.331618117200625e-07, "loss": -0.0069, "num_tokens": 31991240.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4498920440673828, "sampling/importance_sampling_ratio/mean": 0.9998273849487305, "sampling/importance_sampling_ratio/min": 0.6877607703208923, "sampling/sampling_logp_difference/max": 0.37431418895721436, "sampling/sampling_logp_difference/mean": 0.017048103734850883, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 232.96875, "completions/mean_terminated_length": 232.96875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.33657899498939514, "epoch": 1.2377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.019503945983382134, "kl": 0.030245546251535416, "learning_rate": 7.325313944506253e-07, "loss": 0.0003, "num_tokens": 32025974.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.439969539642334, "sampling/importance_sampling_ratio/mean": 1.0001139640808105, "sampling/importance_sampling_ratio/min": 0.6171409487724304, "sampling/sampling_logp_difference/max": 0.48265790939331055, "sampling/sampling_logp_difference/mean": 0.011354037560522556, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 213.109375, "completions/mean_terminated_length": 213.109375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3115587830543518, "epoch": 1.2389705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01905305858685132, "kl": 0.02667442336678505, "learning_rate": 7.319005051571885e-07, "loss": 0.0002, "num_tokens": 32054765.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.518096923828125, "sampling/importance_sampling_ratio/mean": 0.9998236894607544, "sampling/importance_sampling_ratio/min": 0.7007561326026917, "sampling/sampling_logp_difference/max": 0.41745758056640625, "sampling/sampling_logp_difference/mean": 0.012191656976938248, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 259.234375, "completions/mean_terminated_length": 259.234375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4942418336868286, "epoch": 1.2401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.022053430549242234, "kl": 0.03949630260467529, "learning_rate": 7.312691451204177e-07, "loss": 0.0004, "num_tokens": 32091644.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4972546100616455, "sampling/importance_sampling_ratio/mean": 1.0000078678131104, "sampling/importance_sampling_ratio/min": 0.6231526136398315, "sampling/sampling_logp_difference/max": 0.472963809967041, "sampling/sampling_logp_difference/mean": 0.015944818034768105, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 215.09375, "completions/mean_terminated_length": 215.09375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.35535722970962524, "epoch": 1.241421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7438780241335894, "kl": 0.04291858524084091, "learning_rate": 7.306373156219335e-07, "loss": 0.0085, "num_tokens": 32119954.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.298388123512268, "sampling/importance_sampling_ratio/mean": 0.9999449253082275, "sampling/importance_sampling_ratio/min": 0.6209536790847778, "sampling/sampling_logp_difference/max": 0.4764988422393799, "sampling/sampling_logp_difference/mean": 0.012562550604343414, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 294.015625, "completions/mean_terminated_length": 294.015625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.5063092112541199, "epoch": 1.2426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020977379822758387, "kl": 0.04146093875169754, "learning_rate": 7.300050179443099e-07, "loss": 0.0004, "num_tokens": 32158579.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3271468877792358, "sampling/importance_sampling_ratio/mean": 1.000190258026123, "sampling/importance_sampling_ratio/min": 0.6283326745033264, "sampling/sampling_logp_difference/max": 0.4646855592727661, "sampling/sampling_logp_difference/mean": 0.015025531873106956, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 291.453125, "completions/mean_terminated_length": 291.453125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.5707547664642334, "epoch": 1.2438725490196079, "frac_reward_zero_std": 0.25, "grad_norm": 1.2276676590333753, "kl": 0.04638542979955673, "learning_rate": 7.293722533710714e-07, "loss": 0.0127, "num_tokens": 32211472.0, "reward": 0.5, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7457033395767212, "sampling/importance_sampling_ratio/mean": 0.9994102716445923, "sampling/importance_sampling_ratio/min": 0.6368858218193054, "sampling/sampling_logp_difference/max": 0.5571575164794922, "sampling/sampling_logp_difference/mean": 0.01785561442375183, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 252.140625, "completions/mean_terminated_length": 252.140625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.43048757314682007, "epoch": 1.2450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.025821215352880268, "kl": 0.0424998477101326, "learning_rate": 7.287390231866893e-07, "loss": 0.0004, "num_tokens": 32243289.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6037300825119019, "sampling/importance_sampling_ratio/mean": 1.0000109672546387, "sampling/importance_sampling_ratio/min": 0.6250476837158203, "sampling/sampling_logp_difference/max": 0.472332239151001, "sampling/sampling_logp_difference/mean": 0.0146937882527709, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 253.90625, "completions/mean_terminated_length": 253.90625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.42356592416763306, "epoch": 1.2463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.6471867008574946, "kl": 0.05514194071292877, "learning_rate": 7.281053286765815e-07, "loss": 0.0035, "num_tokens": 32277747.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.464868426322937, "sampling/importance_sampling_ratio/mean": 1.0003128051757812, "sampling/importance_sampling_ratio/min": 0.703257143497467, "sampling/sampling_logp_difference/max": 0.38176536560058594, "sampling/sampling_logp_difference/mean": 0.01333966851234436, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 290.984375, "completions/mean_terminated_length": 290.984375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.49442100524902344, "epoch": 1.2475490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.6719842922456302, "kl": 0.03633398562669754, "learning_rate": 7.274711711271073e-07, "loss": 0.0373, "num_tokens": 32313634.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.445326566696167, "sampling/importance_sampling_ratio/mean": 0.999626100063324, "sampling/importance_sampling_ratio/min": 0.578313946723938, "sampling/sampling_logp_difference/max": 0.5476384162902832, "sampling/sampling_logp_difference/mean": 0.01476279366761446, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 216.84375, "completions/mean_terminated_length": 216.84375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.43948325514793396, "epoch": 1.2487745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.02685388704425198, "kl": 0.04969961941242218, "learning_rate": 7.268365518255665e-07, "loss": 0.0005, "num_tokens": 32343144.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4139961004257202, "sampling/importance_sampling_ratio/mean": 1.0000815391540527, "sampling/importance_sampling_ratio/min": 0.6436682939529419, "sampling/sampling_logp_difference/max": 0.44057178497314453, "sampling/sampling_logp_difference/mean": 0.015196645632386208, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 302.8125, "completions/mean_terminated_length": 302.8125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.5333965420722961, "epoch": 1.25, "frac_reward_zero_std": 0.75, "grad_norm": 0.5883894213341854, "kl": 0.06262734532356262, "learning_rate": 7.262014720601958e-07, "loss": 0.0001, "num_tokens": 32390236.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4623440504074097, "sampling/importance_sampling_ratio/mean": 1.0003504753112793, "sampling/importance_sampling_ratio/min": 0.6772839426994324, "sampling/sampling_logp_difference/max": 0.3896646499633789, "sampling/sampling_logp_difference/mean": 0.015126075595617294, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 307.109375, "completions/mean_terminated_length": 307.109375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4063161611557007, "epoch": 1.2512254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.021609550166476525, "kl": 0.03785210847854614, "learning_rate": 7.255659331201673e-07, "loss": 0.0004, "num_tokens": 32431299.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001342296600342, "sampling/importance_sampling_ratio/min": 0.6368669271469116, "sampling/sampling_logp_difference/max": 0.7420744895935059, "sampling/sampling_logp_difference/mean": 0.012437833473086357, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 265.765625, "completions/mean_terminated_length": 265.765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.5111925601959229, "epoch": 1.2524509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.6003328720428629, "kl": 0.06019875407218933, "learning_rate": 7.249299362955845e-07, "loss": 0.0213, "num_tokens": 32469892.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.318848729133606, "sampling/importance_sampling_ratio/mean": 1.0000724792480469, "sampling/importance_sampling_ratio/min": 0.679656982421875, "sampling/sampling_logp_difference/max": 0.386167049407959, "sampling/sampling_logp_difference/mean": 0.01651834324002266, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 253.671875, "completions/mean_terminated_length": 253.671875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5959911346435547, "epoch": 1.2536764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 0.9339709076519227, "kl": 0.08829143643379211, "learning_rate": 7.242934828774808e-07, "loss": -0.0129, "num_tokens": 32509711.0, "reward": 0.125, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6276863813400269, "sampling/importance_sampling_ratio/mean": 1.0000534057617188, "sampling/importance_sampling_ratio/min": 0.6988682150840759, "sampling/sampling_logp_difference/max": 0.4871596097946167, "sampling/sampling_logp_difference/mean": 0.018192298710346222, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.5745341777801514, "epoch": 1.2549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7087028040316709, "kl": 0.053980231285095215, "learning_rate": 7.236565741578162e-07, "loss": 0.0365, "num_tokens": 32551031.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996329545974731, "sampling/importance_sampling_ratio/min": 0.6725244522094727, "sampling/sampling_logp_difference/max": 0.850208044052124, "sampling/sampling_logp_difference/mean": 0.016945503652095795, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 269.203125, "completions/mean_terminated_length": 269.203125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.39986565709114075, "epoch": 1.2561274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.025664668018526356, "kl": 0.040328651666641235, "learning_rate": 7.230192114294753e-07, "loss": 0.0004, "num_tokens": 32585988.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5961456298828125, "sampling/importance_sampling_ratio/mean": 0.9999529719352722, "sampling/importance_sampling_ratio/min": 0.6376746892929077, "sampling/sampling_logp_difference/max": 0.4675917625427246, "sampling/sampling_logp_difference/mean": 0.013972658663988113, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3309788107872009, "epoch": 1.2573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01775832964892254, "kl": 0.024740155786275864, "learning_rate": 7.223813959862638e-07, "loss": 0.0002, "num_tokens": 32624676.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4058010578155518, "sampling/importance_sampling_ratio/mean": 1.0001771450042725, "sampling/importance_sampling_ratio/min": 0.6491348147392273, "sampling/sampling_logp_difference/max": 0.432114839553833, "sampling/sampling_logp_difference/mean": 0.01120903342962265, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 253.078125, "completions/mean_terminated_length": 253.078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.41706496477127075, "epoch": 1.258578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.026393378659487716, "kl": 0.04604463651776314, "learning_rate": 7.217431291229067e-07, "loss": 0.0004, "num_tokens": 32660809.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5524452924728394, "sampling/importance_sampling_ratio/mean": 1.0001442432403564, "sampling/importance_sampling_ratio/min": 0.6628890633583069, "sampling/sampling_logp_difference/max": 0.4398312568664551, "sampling/sampling_logp_difference/mean": 0.013960368931293488, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 208.046875, "completions/mean_terminated_length": 208.046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.36943647265434265, "epoch": 1.2598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.03732666976560762, "kl": 0.04475586861371994, "learning_rate": 7.211044121350454e-07, "loss": 0.0004, "num_tokens": 32690396.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2994472980499268, "sampling/importance_sampling_ratio/mean": 0.9995710849761963, "sampling/importance_sampling_ratio/min": 0.6124758124351501, "sampling/sampling_logp_difference/max": 0.4902458190917969, "sampling/sampling_logp_difference/mean": 0.013447067700326443, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 247.09375, "completions/mean_terminated_length": 247.09375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.42932724952697754, "epoch": 1.2610294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02541340858929853, "kl": 0.03734192997217178, "learning_rate": 7.204652463192347e-07, "loss": 0.0004, "num_tokens": 32726786.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3745006322860718, "sampling/importance_sampling_ratio/mean": 0.9998223781585693, "sampling/importance_sampling_ratio/min": 0.6333354115486145, "sampling/sampling_logp_difference/max": 0.4567551612854004, "sampling/sampling_logp_difference/mean": 0.014694184064865112, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 300.59375, "completions/mean_terminated_length": 300.59375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.46342897415161133, "epoch": 1.2622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02649329390311697, "kl": 0.052085746079683304, "learning_rate": 7.198256329729411e-07, "loss": 0.0005, "num_tokens": 32768632.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.527761459350586, "sampling/importance_sampling_ratio/mean": 1.000121831893921, "sampling/importance_sampling_ratio/min": 0.5563894510269165, "sampling/sampling_logp_difference/max": 0.5862867832183838, "sampling/sampling_logp_difference/mean": 0.014383465051651001, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 225.984375, "completions/mean_terminated_length": 225.984375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3839717507362366, "epoch": 1.2634803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7988206397905827, "kl": 0.043448030948638916, "learning_rate": 7.191855733945386e-07, "loss": -0.0103, "num_tokens": 32808407.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9693922996520996, "sampling/importance_sampling_ratio/mean": 1.0001676082611084, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.677725076675415, "sampling/sampling_logp_difference/mean": 0.013649694621562958, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 218.484375, "completions/mean_terminated_length": 218.484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4352691173553467, "epoch": 1.2647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0543315479902784, "kl": 0.0602213591337204, "learning_rate": 7.185450688833083e-07, "loss": 0.041, "num_tokens": 32839110.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3956884145736694, "sampling/importance_sampling_ratio/mean": 0.9996954202651978, "sampling/importance_sampling_ratio/min": 0.6771960854530334, "sampling/sampling_logp_difference/max": 0.38979434967041016, "sampling/sampling_logp_difference/mean": 0.015575871802866459, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 266.15625, "completions/mean_terminated_length": 266.15625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4167899787425995, "epoch": 1.2659313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.025274840811054574, "kl": 0.03985028713941574, "learning_rate": 7.179041207394331e-07, "loss": 0.0004, "num_tokens": 32873392.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5278154611587524, "sampling/importance_sampling_ratio/mean": 0.9999694228172302, "sampling/importance_sampling_ratio/min": 0.6771520972251892, "sampling/sampling_logp_difference/max": 0.42383885383605957, "sampling/sampling_logp_difference/mean": 0.014345312491059303, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4263748228549957, "epoch": 1.267156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.023683970665192327, "kl": 0.03402005881071091, "learning_rate": 7.172627302639975e-07, "loss": 0.0003, "num_tokens": 32912480.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5070692300796509, "sampling/importance_sampling_ratio/mean": 0.9993304014205933, "sampling/importance_sampling_ratio/min": 0.5725120902061462, "sampling/sampling_logp_difference/max": 0.5577214956283569, "sampling/sampling_logp_difference/mean": 0.014584648422896862, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 283.515625, "completions/mean_terminated_length": 283.515625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4535956084728241, "epoch": 1.2683823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.031473450565118265, "kl": 0.03707154840230942, "learning_rate": 7.166208987589836e-07, "loss": 0.0004, "num_tokens": 32946033.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4811121225357056, "sampling/importance_sampling_ratio/mean": 1.0002422332763672, "sampling/importance_sampling_ratio/min": 0.5675489902496338, "sampling/sampling_logp_difference/max": 0.5664281845092773, "sampling/sampling_logp_difference/mean": 0.01619875617325306, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 241.765625, "completions/mean_terminated_length": 241.765625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2822660803794861, "epoch": 1.2696078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.022556727140358727, "kl": 0.02592044323682785, "learning_rate": 7.159786275272686e-07, "loss": 0.0003, "num_tokens": 32976562.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.597345232963562, "sampling/importance_sampling_ratio/mean": 1.0000855922698975, "sampling/importance_sampling_ratio/min": 0.6534473299980164, "sampling/sampling_logp_difference/max": 0.46834301948547363, "sampling/sampling_logp_difference/mean": 0.012253960594534874, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 212.359375, "completions/mean_terminated_length": 212.359375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.37916967272758484, "epoch": 1.2708333333333333, "frac_reward_zero_std": 0.75, "grad_norm": 0.7746878225115345, "kl": 0.04545515775680542, "learning_rate": 7.153359178726221e-07, "loss": 0.0251, "num_tokens": 33004969.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5469253063201904, "sampling/importance_sampling_ratio/mean": 0.9998739957809448, "sampling/importance_sampling_ratio/min": 0.6379643082618713, "sampling/sampling_logp_difference/max": 0.44947290420532227, "sampling/sampling_logp_difference/mean": 0.015152151696383953, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 224.0625, "completions/mean_terminated_length": 224.0625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.453823447227478, "epoch": 1.2720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.027001662920450437, "kl": 0.039575982838869095, "learning_rate": 7.146927710997046e-07, "loss": 0.0004, "num_tokens": 33036461.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3945215940475464, "sampling/importance_sampling_ratio/mean": 0.9998276233673096, "sampling/importance_sampling_ratio/min": 0.6896499991416931, "sampling/sampling_logp_difference/max": 0.37157106399536133, "sampling/sampling_logp_difference/mean": 0.01585349440574646, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 253.953125, "completions/mean_terminated_length": 253.953125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.32050231099128723, "epoch": 1.2732843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.02010016021050216, "kl": 0.029850102961063385, "learning_rate": 7.140491885140628e-07, "loss": 0.0003, "num_tokens": 33068666.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5174421072006226, "sampling/importance_sampling_ratio/mean": 1.000077247619629, "sampling/importance_sampling_ratio/min": 0.6370596289634705, "sampling/sampling_logp_difference/max": 0.45089197158813477, "sampling/sampling_logp_difference/mean": 0.013122981414198875, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 246.8125, "completions/mean_terminated_length": 246.8125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.425956130027771, "epoch": 1.2745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.022397279953500417, "kl": 0.03901214897632599, "learning_rate": 7.134051714221286e-07, "loss": 0.0003, "num_tokens": 33104446.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.581176996231079, "sampling/importance_sampling_ratio/mean": 1.0002460479736328, "sampling/importance_sampling_ratio/min": 0.616258978843689, "sampling/sampling_logp_difference/max": 0.4840879440307617, "sampling/sampling_logp_difference/mean": 0.015681080520153046, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 188.234375, "completions/mean_terminated_length": 188.234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2762240469455719, "epoch": 1.2757352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.024447707372076348, "kl": 0.02437257394194603, "learning_rate": 7.127607211312162e-07, "loss": 0.0002, "num_tokens": 33130941.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4655163288116455, "sampling/importance_sampling_ratio/mean": 1.0001275539398193, "sampling/importance_sampling_ratio/min": 0.6214760541915894, "sampling/sampling_logp_difference/max": 0.47565793991088867, "sampling/sampling_logp_difference/mean": 0.012499706819653511, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 230.859375, "completions/mean_terminated_length": 230.859375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.302249550819397, "epoch": 1.2769607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.02441438786301167, "kl": 0.026303227990865707, "learning_rate": 7.121158389495185e-07, "loss": 0.0002, "num_tokens": 33161908.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4328745603561401, "sampling/importance_sampling_ratio/mean": 0.9998009204864502, "sampling/importance_sampling_ratio/min": 0.6068763732910156, "sampling/sampling_logp_difference/max": 0.49943017959594727, "sampling/sampling_logp_difference/mean": 0.01246584951877594, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 252.6875, "completions/mean_terminated_length": 252.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4117159843444824, "epoch": 1.278186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.9586207423221864, "kl": 0.04804814234375954, "learning_rate": 7.114705261861061e-07, "loss": 0.0135, "num_tokens": 33202896.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.8996422290802002, "sampling/importance_sampling_ratio/mean": 0.9999984502792358, "sampling/importance_sampling_ratio/min": 0.5023150444030762, "sampling/sampling_logp_difference/max": 0.6885278224945068, "sampling/sampling_logp_difference/mean": 0.016012828797101974, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 243.59375, "completions/mean_terminated_length": 243.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2845577597618103, "epoch": 1.2794117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.8391865100951481, "kl": 0.03303593397140503, "learning_rate": 7.108247841509222e-07, "loss": -0.034, "num_tokens": 33231094.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.627860426902771, "sampling/importance_sampling_ratio/mean": 0.9997678995132446, "sampling/importance_sampling_ratio/min": 0.644490122795105, "sampling/sampling_logp_difference/max": 0.48726654052734375, "sampling/sampling_logp_difference/mean": 0.011671999469399452, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 182.859375, "completions/mean_terminated_length": 182.859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3172720968723297, "epoch": 1.280637254901961, "frac_reward_zero_std": 0.25, "grad_norm": 1.5413983239586968, "kl": 0.05567832291126251, "learning_rate": 7.101786141547828e-07, "loss": 0.0256, "num_tokens": 33258365.0, "reward": 0.0, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6092875003814697, "sampling/importance_sampling_ratio/mean": 1.000196933746338, "sampling/importance_sampling_ratio/min": 0.6497927308082581, "sampling/sampling_logp_difference/max": 0.47579145431518555, "sampling/sampling_logp_difference/mean": 0.013540423475205898, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 218.59375, "completions/mean_terminated_length": 218.59375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.32802873849868774, "epoch": 1.281862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.025774050768263553, "kl": 0.024461662396788597, "learning_rate": 7.095320175093718e-07, "loss": 0.0002, "num_tokens": 33287971.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.550926923751831, "sampling/importance_sampling_ratio/mean": 0.9997495412826538, "sampling/importance_sampling_ratio/min": 0.6130277514457703, "sampling/sampling_logp_difference/max": 0.48934507369995117, "sampling/sampling_logp_difference/mean": 0.013812687247991562, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 158.9375, "completions/mean_terminated_length": 158.9375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.24623757600784302, "epoch": 1.2830882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.031035160236915355, "kl": 0.029723387211561203, "learning_rate": 7.088849955272396e-07, "loss": 0.0003, "num_tokens": 33313935.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3437520265579224, "sampling/importance_sampling_ratio/mean": 0.9991786479949951, "sampling/importance_sampling_ratio/min": 0.6279453039169312, "sampling/sampling_logp_difference/max": 0.4653022289276123, "sampling/sampling_logp_difference/mean": 0.011917706578969955, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 157.765625, "completions/mean_terminated_length": 157.765625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3262321949005127, "epoch": 1.284313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.03942257101959648, "kl": 0.041421011090278625, "learning_rate": 7.082375495217995e-07, "loss": 0.0004, "num_tokens": 33339824.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4285696744918823, "sampling/importance_sampling_ratio/mean": 1.0001627206802368, "sampling/importance_sampling_ratio/min": 0.6262629628181458, "sampling/sampling_logp_difference/max": 0.4679849147796631, "sampling/sampling_logp_difference/mean": 0.015121504664421082, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 196.671875, "completions/mean_terminated_length": 196.671875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.32499948143959045, "epoch": 1.2855392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.03905924470577792, "kl": 0.031079092994332314, "learning_rate": 7.075896808073263e-07, "loss": 0.0003, "num_tokens": 33370571.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5940988063812256, "sampling/importance_sampling_ratio/mean": 1.000136375427246, "sampling/importance_sampling_ratio/min": 0.6450546383857727, "sampling/sampling_logp_difference/max": 0.46630859375, "sampling/sampling_logp_difference/mean": 0.014183073304593563, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 189.15625, "completions/mean_terminated_length": 189.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.33075258135795593, "epoch": 1.2867647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.8668856395806319, "kl": 0.052480727434158325, "learning_rate": 7.069413906989523e-07, "loss": 0.013, "num_tokens": 33400869.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5986402034759521, "sampling/importance_sampling_ratio/mean": 0.9995260834693909, "sampling/importance_sampling_ratio/min": 0.6558920741081238, "sampling/sampling_logp_difference/max": 0.46915340423583984, "sampling/sampling_logp_difference/mean": 0.013863028958439827, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 188.609375, "completions/mean_terminated_length": 188.609375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3841259479522705, "epoch": 1.2879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.9909333990221172, "kl": 0.05324501544237137, "learning_rate": 7.062926805126652e-07, "loss": 0.0465, "num_tokens": 33431148.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5077095031738281, "sampling/importance_sampling_ratio/mean": 1.0002236366271973, "sampling/importance_sampling_ratio/min": 0.6441048383712769, "sampling/sampling_logp_difference/max": 0.4398937225341797, "sampling/sampling_logp_difference/mean": 0.016419682651758194, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 180.4375, "completions/mean_terminated_length": 180.4375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.390139639377594, "epoch": 1.2892156862745099, "frac_reward_zero_std": 0.75, "grad_norm": 1.0680609818372886, "kl": 0.04708288609981537, "learning_rate": 7.056435515653058e-07, "loss": -0.048, "num_tokens": 33459352.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4656306505203247, "sampling/importance_sampling_ratio/mean": 1.0000035762786865, "sampling/importance_sampling_ratio/min": 0.5664933919906616, "sampling/sampling_logp_difference/max": 0.5682897567749023, "sampling/sampling_logp_difference/mean": 0.01614879071712494, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 179.28125, "completions/mean_terminated_length": 179.28125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.38631048798561096, "epoch": 1.2904411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.4400635236264094, "kl": 0.06664882600307465, "learning_rate": 7.049940051745646e-07, "loss": 0.0118, "num_tokens": 33486490.0, "reward": 0.21875, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4916131496429443, "sampling/importance_sampling_ratio/mean": 0.9997199177742004, "sampling/importance_sampling_ratio/min": 0.6213639974594116, "sampling/sampling_logp_difference/max": 0.47583818435668945, "sampling/sampling_logp_difference/mean": 0.01591946929693222, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 188.8125, "completions/mean_terminated_length": 188.8125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3945600986480713, "epoch": 1.2916666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 1.2811574243731971, "kl": 0.060895487666130066, "learning_rate": 7.043440426589795e-07, "loss": -0.0234, "num_tokens": 33520894.0, "reward": -0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5335569381713867, "sampling/importance_sampling_ratio/mean": 1.0000795125961304, "sampling/importance_sampling_ratio/min": 0.6764960885047913, "sampling/sampling_logp_difference/max": 0.42758989334106445, "sampling/sampling_logp_difference/mean": 0.015397395007312298, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 169.578125, "completions/mean_terminated_length": 169.578125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2863377630710602, "epoch": 1.2928921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.9757538928484449, "kl": 0.05878036841750145, "learning_rate": 7.036936653379335e-07, "loss": 0.0069, "num_tokens": 33550339.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.422853708267212, "sampling/importance_sampling_ratio/mean": 0.9996306300163269, "sampling/importance_sampling_ratio/min": 0.6298396587371826, "sampling/sampling_logp_difference/max": 0.46229004859924316, "sampling/sampling_logp_difference/mean": 0.013411764986813068, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 195.890625, "completions/mean_terminated_length": 195.890625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3994911313056946, "epoch": 1.2941176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 0.8204277477205849, "kl": 0.06238219887018204, "learning_rate": 7.030428745316512e-07, "loss": 0.015, "num_tokens": 33585308.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5112587213516235, "sampling/importance_sampling_ratio/mean": 1.0002164840698242, "sampling/importance_sampling_ratio/min": 0.6720823645591736, "sampling/sampling_logp_difference/max": 0.41294288635253906, "sampling/sampling_logp_difference/mean": 0.015088235959410667, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.35088345408439636, "epoch": 1.295343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8394822331891233, "kl": 0.05312206968665123, "learning_rate": 7.023916715611968e-07, "loss": 0.046, "num_tokens": 33618166.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.642237901687622, "sampling/importance_sampling_ratio/mean": 1.000025987625122, "sampling/importance_sampling_ratio/min": 0.5948571562767029, "sampling/sampling_logp_difference/max": 0.5194339752197266, "sampling/sampling_logp_difference/mean": 0.014569773338735104, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 198.328125, "completions/mean_terminated_length": 198.328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.43649303913116455, "epoch": 1.2965686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.06133202697411687, "kl": 0.06992170214653015, "learning_rate": 7.017400577484712e-07, "loss": 0.0007, "num_tokens": 33647435.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4692312479019165, "sampling/importance_sampling_ratio/mean": 0.9996936321258545, "sampling/importance_sampling_ratio/min": 0.26469555497169495, "sampling/sampling_logp_difference/max": 1.3291749954223633, "sampling/sampling_logp_difference/mean": 0.0163760744035244, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 167.84375, "completions/mean_terminated_length": 167.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3744012117385864, "epoch": 1.2977941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.7941412030978372, "kl": 0.04241356998682022, "learning_rate": 7.010880344162086e-07, "loss": 0.019, "num_tokens": 33677073.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5307191610336304, "sampling/importance_sampling_ratio/mean": 0.9998902678489685, "sampling/importance_sampling_ratio/min": 0.6202341318130493, "sampling/sampling_logp_difference/max": 0.4776582717895508, "sampling/sampling_logp_difference/mean": 0.01614062674343586, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 192.03125, "completions/mean_terminated_length": 192.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.36728131771087646, "epoch": 1.2990196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.9322385344088944, "kl": 0.03659145534038544, "learning_rate": 7.004356028879758e-07, "loss": -0.0027, "num_tokens": 33709619.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4058219194412231, "sampling/importance_sampling_ratio/mean": 0.9997914433479309, "sampling/importance_sampling_ratio/min": 0.5355716347694397, "sampling/sampling_logp_difference/max": 0.6244206428527832, "sampling/sampling_logp_difference/mean": 0.01563429832458496, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 151.640625, "completions/mean_terminated_length": 151.640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3456276059150696, "epoch": 1.3002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.1141161126383432, "kl": 0.05363607406616211, "learning_rate": 6.99782764488167e-07, "loss": 0.0238, "num_tokens": 33738236.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5828100442886353, "sampling/importance_sampling_ratio/mean": 0.9999970197677612, "sampling/importance_sampling_ratio/min": 0.6303369998931885, "sampling/sampling_logp_difference/max": 0.4615006446838379, "sampling/sampling_logp_difference/mean": 0.015090061351656914, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 171.609375, "completions/mean_terminated_length": 171.609375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.27703601121902466, "epoch": 1.3014705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.026881586740977955, "kl": 0.029022444039583206, "learning_rate": 6.991295205420027e-07, "loss": 0.0003, "num_tokens": 33766995.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4080665111541748, "sampling/importance_sampling_ratio/mean": 1.0002038478851318, "sampling/importance_sampling_ratio/min": 0.583820641040802, "sampling/sampling_logp_difference/max": 0.5381613969802856, "sampling/sampling_logp_difference/mean": 0.011404757387936115, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 179.984375, "completions/mean_terminated_length": 179.984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4201258420944214, "epoch": 1.3026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.052123880280025646, "kl": 0.0661260113120079, "learning_rate": 6.984758723755272e-07, "loss": 0.0006, "num_tokens": 33797682.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6265360116958618, "sampling/importance_sampling_ratio/mean": 1.0000171661376953, "sampling/importance_sampling_ratio/min": 0.7134343981742859, "sampling/sampling_logp_difference/max": 0.486452579498291, "sampling/sampling_logp_difference/mean": 0.016265347599983215, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 184.578125, "completions/mean_terminated_length": 184.578125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4626278877258301, "epoch": 1.303921568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.41291532247678, "kl": 0.07594802230596542, "learning_rate": 6.978218213156044e-07, "loss": -0.034, "num_tokens": 33824631.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.7275714874267578, "sampling/importance_sampling_ratio/mean": 1.000497817993164, "sampling/importance_sampling_ratio/min": 0.6479448676109314, "sampling/sampling_logp_difference/max": 0.5467166900634766, "sampling/sampling_logp_difference/mean": 0.017567459493875504, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.42885833978652954, "epoch": 1.3051470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.268876871042677, "kl": 0.06140081584453583, "learning_rate": 6.971673686899169e-07, "loss": 0.0301, "num_tokens": 33855047.0, "reward": 0.28125, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.469535231590271, "sampling/importance_sampling_ratio/mean": 0.9998229742050171, "sampling/importance_sampling_ratio/min": 0.6435920000076294, "sampling/sampling_logp_difference/max": 0.440690279006958, "sampling/sampling_logp_difference/mean": 0.01758875697851181, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 159.421875, "completions/mean_terminated_length": 159.421875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.36572587490081787, "epoch": 1.3063725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.047132954323472774, "kl": 0.08620236068964005, "learning_rate": 6.965125158269618e-07, "loss": 0.0007, "num_tokens": 33883106.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9998907446861267, "sampling/importance_sampling_ratio/min": 0.6875842809677124, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.015279294922947884, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.39573433995246887, "epoch": 1.3075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0250018360360977, "kl": 0.06333038210868835, "learning_rate": 6.958572640560491e-07, "loss": 0.0092, "num_tokens": 33917190.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5846731662750244, "sampling/importance_sampling_ratio/mean": 1.0000784397125244, "sampling/importance_sampling_ratio/min": 0.6348634958267212, "sampling/sampling_logp_difference/max": 0.46037817001342773, "sampling/sampling_logp_difference/mean": 0.015109268017113209, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 169.53125, "completions/mean_terminated_length": 169.53125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2902347147464752, "epoch": 1.3088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.999846818846809, "kl": 0.04580019414424896, "learning_rate": 6.952016147072981e-07, "loss": 0.012, "num_tokens": 33943336.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6195276975631714, "sampling/importance_sampling_ratio/mean": 1.0002738237380981, "sampling/importance_sampling_ratio/min": 0.6298382878303528, "sampling/sampling_logp_difference/max": 0.4821345806121826, "sampling/sampling_logp_difference/mean": 0.013744648545980453, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 168.203125, "completions/mean_terminated_length": 168.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4487200379371643, "epoch": 1.3100490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 1.370408774743456, "kl": 0.08198067545890808, "learning_rate": 6.945455691116358e-07, "loss": 0.007, "num_tokens": 33972485.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4868720769882202, "sampling/importance_sampling_ratio/mean": 1.000361442565918, "sampling/importance_sampling_ratio/min": 0.6132069826126099, "sampling/sampling_logp_difference/max": 0.48905277252197266, "sampling/sampling_logp_difference/mean": 0.01734788343310356, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 163.34375, "completions/mean_terminated_length": 163.34375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.32821619510650635, "epoch": 1.3112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.024778477204538275, "kl": 0.03450584411621094, "learning_rate": 6.938891286007928e-07, "loss": 0.0003, "num_tokens": 34007067.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000243067741394, "sampling/importance_sampling_ratio/min": 0.6840278506278992, "sampling/sampling_logp_difference/max": 0.7153477668762207, "sampling/sampling_logp_difference/mean": 0.01339616347104311, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 153.203125, "completions/mean_terminated_length": 153.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.39119040966033936, "epoch": 1.3125, "frac_reward_zero_std": 0.75, "grad_norm": 1.0528701708787427, "kl": 0.056332677602767944, "learning_rate": 6.932322945073023e-07, "loss": 0.0085, "num_tokens": 34031000.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4968187808990479, "sampling/importance_sampling_ratio/mean": 1.0003398656845093, "sampling/importance_sampling_ratio/min": 0.6802636384963989, "sampling/sampling_logp_difference/max": 0.40334200859069824, "sampling/sampling_logp_difference/mean": 0.01601910963654518, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3235864043235779, "epoch": 1.3137254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.02920336386591169, "kl": 0.03633510693907738, "learning_rate": 6.925750681644953e-07, "loss": 0.0004, "num_tokens": 34056744.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.616931676864624, "sampling/importance_sampling_ratio/mean": 0.9995802044868469, "sampling/importance_sampling_ratio/min": 0.6068379282951355, "sampling/sampling_logp_difference/max": 0.4994935989379883, "sampling/sampling_logp_difference/mean": 0.013510503806173801, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 169.90625, "completions/mean_terminated_length": 169.90625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2885391414165497, "epoch": 1.3149509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.018540908878206534, "kl": 0.023091096431016922, "learning_rate": 6.919174509065003e-07, "loss": 0.0002, "num_tokens": 34094530.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3570696115493774, "sampling/importance_sampling_ratio/mean": 1.000139832496643, "sampling/importance_sampling_ratio/min": 0.7576091289520264, "sampling/sampling_logp_difference/max": 0.3053276538848877, "sampling/sampling_logp_difference/mean": 0.011862678453326225, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 218.53125, "completions/mean_terminated_length": 218.53125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.379422664642334, "epoch": 1.3161764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.6157068019165124, "kl": 0.06551677733659744, "learning_rate": 6.91259444068238e-07, "loss": -0.0096, "num_tokens": 34126740.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.354239821434021, "sampling/importance_sampling_ratio/mean": 0.999758243560791, "sampling/importance_sampling_ratio/min": 0.6220086216926575, "sampling/sampling_logp_difference/max": 0.47480130195617676, "sampling/sampling_logp_difference/mean": 0.013810522854328156, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 211.46875, "completions/mean_terminated_length": 211.46875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.549732506275177, "epoch": 1.3174019607843137, "frac_reward_zero_std": 0.25, "grad_norm": 1.7921593436581686, "kl": 0.0827806368470192, "learning_rate": 6.906010489854209e-07, "loss": -0.0191, "num_tokens": 34162354.0, "reward": -0.1875, "reward_std": 0.6116957664489746, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4058430194854736, "sampling/importance_sampling_ratio/mean": 0.9995255470275879, "sampling/importance_sampling_ratio/min": 0.6106529235839844, "sampling/sampling_logp_difference/max": 0.4932265281677246, "sampling/sampling_logp_difference/mean": 0.018393494188785553, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.5191712379455566, "epoch": 1.3186274509803921, "frac_reward_zero_std": 0.5, "grad_norm": 1.315060820072996, "kl": 0.0638878345489502, "learning_rate": 6.899422669945493e-07, "loss": -0.0188, "num_tokens": 34191818.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.7806079387664795, "sampling/importance_sampling_ratio/mean": 0.9991124272346497, "sampling/importance_sampling_ratio/min": 0.627777636051178, "sampling/sampling_logp_difference/max": 0.5769548416137695, "sampling/sampling_logp_difference/mean": 0.01793564110994339, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4007861614227295, "epoch": 1.3198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.029734999155876355, "kl": 0.036175504326820374, "learning_rate": 6.892830994329088e-07, "loss": 0.0004, "num_tokens": 34221402.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2835105657577515, "sampling/importance_sampling_ratio/mean": 1.0006650686264038, "sampling/importance_sampling_ratio/min": 0.7054614424705505, "sampling/sampling_logp_difference/max": 0.34890317916870117, "sampling/sampling_logp_difference/mean": 0.015294120647013187, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 214.078125, "completions/mean_terminated_length": 214.078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.5529769659042358, "epoch": 1.321078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.2654157102524206, "kl": 0.0877370834350586, "learning_rate": 6.886235476385681e-07, "loss": -0.0188, "num_tokens": 34253039.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.2814871072769165, "sampling/importance_sampling_ratio/mean": 0.9998075366020203, "sampling/importance_sampling_ratio/min": 0.7272739410400391, "sampling/sampling_logp_difference/max": 0.3184521198272705, "sampling/sampling_logp_difference/mean": 0.01777786575257778, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 146.078125, "completions/mean_terminated_length": 146.078125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.46738529205322266, "epoch": 1.3223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.9922101852402277, "kl": 0.07144574820995331, "learning_rate": 6.879636129503751e-07, "loss": -0.004, "num_tokens": 34282676.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6232999563217163, "sampling/importance_sampling_ratio/mean": 1.000260353088379, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.48446106910705566, "sampling/sampling_logp_difference/mean": 0.017098616808652878, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 160.453125, "completions/mean_terminated_length": 160.453125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.447221040725708, "epoch": 1.3235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.03038422686122859, "kl": 0.034434664994478226, "learning_rate": 6.87303296707956e-07, "loss": 0.0004, "num_tokens": 34314769.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0005841255187988, "sampling/importance_sampling_ratio/min": 0.6435648798942566, "sampling/sampling_logp_difference/max": 0.440732479095459, "sampling/sampling_logp_difference/mean": 0.015548791736364365, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4014641046524048, "epoch": 1.3247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.2977414891724848, "kl": 0.060253456234931946, "learning_rate": 6.866426002517105e-07, "loss": -0.0312, "num_tokens": 34339185.0, "reward": -0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4831007719039917, "sampling/importance_sampling_ratio/mean": 0.9999460577964783, "sampling/importance_sampling_ratio/min": 0.6402031183242798, "sampling/sampling_logp_difference/max": 0.445969820022583, "sampling/sampling_logp_difference/mean": 0.014733843505382538, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 174.703125, "completions/mean_terminated_length": 174.703125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3389003872871399, "epoch": 1.3259803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.1914942711509917, "kl": 0.054510023444890976, "learning_rate": 6.859815249228105e-07, "loss": 0.0098, "num_tokens": 34367022.0, "reward": 0.625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5194233655929565, "sampling/importance_sampling_ratio/mean": 1.0001471042633057, "sampling/importance_sampling_ratio/min": 0.6015815138816833, "sampling/sampling_logp_difference/max": 0.5081932544708252, "sampling/sampling_logp_difference/mean": 0.012282421812415123, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 156.71875, "completions/mean_terminated_length": 156.71875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.4586590826511383, "epoch": 1.3272058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 1.2045103896785339, "kl": 0.041404061019420624, "learning_rate": 6.853200720631972e-07, "loss": 0.0132, "num_tokens": 34392620.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.8330409526824951, "sampling/importance_sampling_ratio/mean": 1.0000096559524536, "sampling/importance_sampling_ratio/min": 0.6203129291534424, "sampling/sampling_logp_difference/max": 0.6059763431549072, "sampling/sampling_logp_difference/mean": 0.01719982549548149, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 158.015625, "completions/mean_terminated_length": 158.015625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3859842121601105, "epoch": 1.3284313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.02756333372447915, "kl": 0.04340790957212448, "learning_rate": 6.846582430155781e-07, "loss": 0.0004, "num_tokens": 34417645.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.635543704032898, "sampling/importance_sampling_ratio/mean": 1.0003108978271484, "sampling/importance_sampling_ratio/min": 0.714081883430481, "sampling/sampling_logp_difference/max": 0.4919753074645996, "sampling/sampling_logp_difference/mean": 0.014954311773180962, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 174.65625, "completions/mean_terminated_length": 174.65625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4257662892341614, "epoch": 1.329656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.02157836716451573, "kl": 0.045886047184467316, "learning_rate": 6.839960391234242e-07, "loss": 0.0004, "num_tokens": 34441799.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3246698379516602, "sampling/importance_sampling_ratio/mean": 0.9998934268951416, "sampling/importance_sampling_ratio/min": 0.6550384759902954, "sampling/sampling_logp_difference/max": 0.4230612516403198, "sampling/sampling_logp_difference/mean": 0.015735294669866562, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 210.5625, "completions/mean_terminated_length": 210.5625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.34981435537338257, "epoch": 1.3308823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.01947714238593482, "kl": 0.029721500352025032, "learning_rate": 6.833334617309672e-07, "loss": 0.0003, "num_tokens": 34474811.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5748045444488525, "sampling/importance_sampling_ratio/mean": 1.000112771987915, "sampling/importance_sampling_ratio/min": 0.6208308339118958, "sampling/sampling_logp_difference/max": 0.4766967296600342, "sampling/sampling_logp_difference/mean": 0.014098942279815674, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.41365596652030945, "epoch": 1.3321078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.04542017220947505, "kl": 0.06938691437244415, "learning_rate": 6.826705121831976e-07, "loss": 0.0007, "num_tokens": 34502447.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.546083927154541, "sampling/importance_sampling_ratio/mean": 1.0008903741836548, "sampling/importance_sampling_ratio/min": 0.6928634643554688, "sampling/sampling_logp_difference/max": 0.43572521209716797, "sampling/sampling_logp_difference/mean": 0.014942721463739872, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 194.734375, "completions/mean_terminated_length": 194.734375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.37932366132736206, "epoch": 1.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.015491019048548296, "kl": 0.03091152012348175, "learning_rate": 6.820071918258605e-07, "loss": 0.0003, "num_tokens": 34533678.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3494160175323486, "sampling/importance_sampling_ratio/mean": 1.0000969171524048, "sampling/importance_sampling_ratio/min": 0.7206055521965027, "sampling/sampling_logp_difference/max": 0.3276634216308594, "sampling/sampling_logp_difference/mean": 0.015398137271404266, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 148.140625, "completions/mean_terminated_length": 148.140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3391225337982178, "epoch": 1.3345588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.043334780455243624, "kl": 0.040297143161296844, "learning_rate": 6.813435020054548e-07, "loss": 0.0004, "num_tokens": 34558567.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.375964879989624, "sampling/importance_sampling_ratio/mean": 1.0002107620239258, "sampling/importance_sampling_ratio/min": 0.18915744125843048, "sampling/sampling_logp_difference/max": 1.6651755571365356, "sampling/sampling_logp_difference/mean": 0.013925185427069664, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 140.5625, "completions/mean_terminated_length": 140.5625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.40812548995018005, "epoch": 1.3357843137254901, "frac_reward_zero_std": 0.5, "grad_norm": 1.5176622119872871, "kl": 0.06736913323402405, "learning_rate": 6.806794440692282e-07, "loss": -0.0521, "num_tokens": 34584811.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3238334655761719, "sampling/importance_sampling_ratio/mean": 1.000195026397705, "sampling/importance_sampling_ratio/min": 0.6368677020072937, "sampling/sampling_logp_difference/max": 0.45119333267211914, "sampling/sampling_logp_difference/mean": 0.015394306741654873, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 184.078125, "completions/mean_terminated_length": 184.078125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.347525417804718, "epoch": 1.3370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.019446197898854066, "kl": 0.03219003975391388, "learning_rate": 6.800150193651767e-07, "loss": 0.0003, "num_tokens": 34613328.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3008006811141968, "sampling/importance_sampling_ratio/mean": 0.9996593594551086, "sampling/importance_sampling_ratio/min": 0.6955578327178955, "sampling/sampling_logp_difference/max": 0.3630410432815552, "sampling/sampling_logp_difference/mean": 0.012894319370388985, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 187.265625, "completions/mean_terminated_length": 187.265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3292316198348999, "epoch": 1.3382352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.01643373329539923, "kl": 0.029864570125937462, "learning_rate": 6.793502292420401e-07, "loss": 0.0003, "num_tokens": 34641025.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4144617319107056, "sampling/importance_sampling_ratio/mean": 0.9999804496765137, "sampling/importance_sampling_ratio/min": 0.6217187643051147, "sampling/sampling_logp_difference/max": 0.4752674102783203, "sampling/sampling_logp_difference/mean": 0.012645787559449673, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 234.296875, "completions/mean_terminated_length": 234.296875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5638883709907532, "epoch": 1.3394607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 1.1076845969024987, "kl": 0.05605001002550125, "learning_rate": 6.786850750493005e-07, "loss": -0.0405, "num_tokens": 34673972.0, "reward": -0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4678847789764404, "sampling/importance_sampling_ratio/mean": 1.000309705734253, "sampling/importance_sampling_ratio/min": 0.6138759255409241, "sampling/sampling_logp_difference/max": 0.4879624843597412, "sampling/sampling_logp_difference/mean": 0.017899105325341225, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3278416693210602, "epoch": 1.340686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.018262285100793196, "kl": 0.03446147218346596, "learning_rate": 6.780195581371784e-07, "loss": 0.0003, "num_tokens": 34698960.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4232503175735474, "sampling/importance_sampling_ratio/mean": 1.0003902912139893, "sampling/importance_sampling_ratio/min": 0.7128939032554626, "sampling/sampling_logp_difference/max": 0.35294318199157715, "sampling/sampling_logp_difference/mean": 0.01291445642709732, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 194.828125, "completions/mean_terminated_length": 194.828125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.34137165546417236, "epoch": 1.3419117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.014778505546742511, "kl": 0.02961350977420807, "learning_rate": 6.773536798566313e-07, "loss": 0.0003, "num_tokens": 34728965.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.441498875617981, "sampling/importance_sampling_ratio/mean": 1.0001440048217773, "sampling/importance_sampling_ratio/min": 0.7211413979530334, "sampling/sampling_logp_difference/max": 0.3656834363937378, "sampling/sampling_logp_difference/mean": 0.013568964786827564, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 219.421875, "completions/mean_terminated_length": 219.421875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3275423049926758, "epoch": 1.343137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.011822894651897455, "kl": 0.024325821548700333, "learning_rate": 6.766874415593495e-07, "loss": 0.0002, "num_tokens": 34760336.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5737786293029785, "sampling/importance_sampling_ratio/mean": 0.9997653365135193, "sampling/importance_sampling_ratio/min": 0.6470595002174377, "sampling/sampling_logp_difference/max": 0.453479528427124, "sampling/sampling_logp_difference/mean": 0.01228781696408987, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 178.703125, "completions/mean_terminated_length": 178.703125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.359050989151001, "epoch": 1.344362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.017683037402748594, "kl": 0.035708922892808914, "learning_rate": 6.760208445977549e-07, "loss": 0.0003, "num_tokens": 34786397.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2990641593933105, "sampling/importance_sampling_ratio/mean": 1.000245451927185, "sampling/importance_sampling_ratio/min": 0.6298472881317139, "sampling/sampling_logp_difference/max": 0.462277889251709, "sampling/sampling_logp_difference/mean": 0.013666082173585892, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 188.328125, "completions/mean_terminated_length": 188.328125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4512159824371338, "epoch": 1.3455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.589649904007106, "kl": 0.03694362938404083, "learning_rate": 6.753538903249974e-07, "loss": 0.0039, "num_tokens": 34824578.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4352165460586548, "sampling/importance_sampling_ratio/mean": 0.9999997019767761, "sampling/importance_sampling_ratio/min": 0.638515830039978, "sampling/sampling_logp_difference/max": 0.4486088752746582, "sampling/sampling_logp_difference/mean": 0.017055150121450424, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.5191019773483276, "epoch": 1.346813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.4883177965879764, "kl": 0.05407078564167023, "learning_rate": 6.74686580094951e-07, "loss": -0.0447, "num_tokens": 34855834.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4751386642456055, "sampling/importance_sampling_ratio/mean": 1.0001651048660278, "sampling/importance_sampling_ratio/min": 0.6531107425689697, "sampling/sampling_logp_difference/max": 0.42600858211517334, "sampling/sampling_logp_difference/mean": 0.017602190375328064, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 217.03125, "completions/mean_terminated_length": 217.03125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5169070959091187, "epoch": 1.3480392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.7540769944517511, "kl": 0.04155392199754715, "learning_rate": 6.740189152622142e-07, "loss": -0.0131, "num_tokens": 34889244.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6672985553741455, "sampling/importance_sampling_ratio/mean": 1.0004644393920898, "sampling/importance_sampling_ratio/min": 0.5918436646461487, "sampling/sampling_logp_difference/max": 0.524512767791748, "sampling/sampling_logp_difference/mean": 0.01677050068974495, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 166.21875, "completions/mean_terminated_length": 166.21875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.37008750438690186, "epoch": 1.3492647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.020667374575687864, "kl": 0.03254357725381851, "learning_rate": 6.733508971821036e-07, "loss": 0.0003, "num_tokens": 34915226.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3038923740386963, "sampling/importance_sampling_ratio/mean": 1.0001362562179565, "sampling/importance_sampling_ratio/min": 0.6747278571128845, "sampling/sampling_logp_difference/max": 0.39344584941864014, "sampling/sampling_logp_difference/mean": 0.01481970027089119, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 256.765625, "completions/mean_terminated_length": 256.765625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.47072160243988037, "epoch": 1.3504901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.01408650788272973, "kl": 0.026061909273266792, "learning_rate": 6.726825272106538e-07, "loss": 0.0002, "num_tokens": 34949867.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3423312902450562, "sampling/importance_sampling_ratio/mean": 1.0000892877578735, "sampling/importance_sampling_ratio/min": 0.6159026622772217, "sampling/sampling_logp_difference/max": 0.4846663475036621, "sampling/sampling_logp_difference/mean": 0.014897543005645275, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.5809694528579712, "epoch": 1.3517156862745099, "frac_reward_zero_std": 0.25, "grad_norm": 1.0102333432107318, "kl": 0.0789414718747139, "learning_rate": 6.720138067046134e-07, "loss": -0.0356, "num_tokens": 34982715.0, "reward": -0.25, "reward_std": 0.5765564441680908, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3394501209259033, "sampling/importance_sampling_ratio/mean": 1.000265121459961, "sampling/importance_sampling_ratio/min": 0.7155026793479919, "sampling/sampling_logp_difference/max": 0.33476996421813965, "sampling/sampling_logp_difference/mean": 0.01793723925948143, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 170.265625, "completions/mean_terminated_length": 170.265625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.28509607911109924, "epoch": 1.3529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0186612970010302, "kl": 0.035473503172397614, "learning_rate": 6.713447370214431e-07, "loss": 0.0003, "num_tokens": 35008444.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4462963342666626, "sampling/importance_sampling_ratio/mean": 1.000580072402954, "sampling/importance_sampling_ratio/min": 0.6890444755554199, "sampling/sampling_logp_difference/max": 0.3724493980407715, "sampling/sampling_logp_difference/mean": 0.011612621136009693, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 186.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4017951190471649, "epoch": 1.3541666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.020731791424474785, "kl": 0.03592874854803085, "learning_rate": 6.706753195193116e-07, "loss": 0.0004, "num_tokens": 35036120.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3652873039245605, "sampling/importance_sampling_ratio/mean": 0.999518871307373, "sampling/importance_sampling_ratio/min": 0.6376883387565613, "sampling/sampling_logp_difference/max": 0.4499056339263916, "sampling/sampling_logp_difference/mean": 0.015169748105108738, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 162.296875, "completions/mean_terminated_length": 162.296875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3948853611946106, "epoch": 1.3553921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.02274097173537011, "kl": 0.04195543751120567, "learning_rate": 6.700055555570941e-07, "loss": 0.0004, "num_tokens": 35063675.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.337462067604065, "sampling/importance_sampling_ratio/mean": 0.9998512268066406, "sampling/importance_sampling_ratio/min": 0.5934198498725891, "sampling/sampling_logp_difference/max": 0.5218531489372253, "sampling/sampling_logp_difference/mean": 0.015314958989620209, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 162.890625, "completions/mean_terminated_length": 162.890625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.35850122570991516, "epoch": 1.3566176470588236, "frac_reward_zero_std": 0.75, "grad_norm": 1.0855367154891349, "kl": 0.03481462597846985, "learning_rate": 6.693354464943688e-07, "loss": -0.0038, "num_tokens": 35089348.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6007455587387085, "sampling/importance_sampling_ratio/mean": 1.0004498958587646, "sampling/importance_sampling_ratio/min": 0.5633743405342102, "sampling/sampling_logp_difference/max": 0.5738110542297363, "sampling/sampling_logp_difference/mean": 0.014802731573581696, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 188.703125, "completions/mean_terminated_length": 188.703125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.28801846504211426, "epoch": 1.357843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.01740773670001392, "kl": 0.025060242041945457, "learning_rate": 6.68664993691415e-07, "loss": 0.0002, "num_tokens": 35123761.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.586329698562622, "sampling/importance_sampling_ratio/mean": 0.999829888343811, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.4614229202270508, "sampling/sampling_logp_difference/mean": 0.011413703672587872, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.39979416131973267, "epoch": 1.3590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.01650341982688638, "kl": 0.02971898391842842, "learning_rate": 6.679941985092092e-07, "loss": 0.0003, "num_tokens": 35156673.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5042294263839722, "sampling/importance_sampling_ratio/mean": 1.000229835510254, "sampling/importance_sampling_ratio/min": 0.6448522210121155, "sampling/sampling_logp_difference/max": 0.43873417377471924, "sampling/sampling_logp_difference/mean": 0.015918847173452377, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.46098411083221436, "epoch": 1.3602941176470589, "frac_reward_zero_std": 0.25, "grad_norm": 1.5147163764637823, "kl": 0.052413132041692734, "learning_rate": 6.673230623094231e-07, "loss": -0.0074, "num_tokens": 35189177.0, "reward": 0.5, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005366802215576, "sampling/importance_sampling_ratio/min": 0.5570670962333679, "sampling/sampling_logp_difference/max": 0.7709481716156006, "sampling/sampling_logp_difference/mean": 0.016182512044906616, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 302.609375, "completions/mean_terminated_length": 302.609375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.42158064246177673, "epoch": 1.3615196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 0.9139938155662435, "kl": 0.04148112237453461, "learning_rate": 6.666515864544208e-07, "loss": 0.02, "num_tokens": 35225936.0, "reward": 0.1875, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4334468841552734, "sampling/importance_sampling_ratio/mean": 0.9999098777770996, "sampling/importance_sampling_ratio/min": 0.7078362703323364, "sampling/sampling_logp_difference/max": 0.36008191108703613, "sampling/sampling_logp_difference/mean": 0.014339843764901161, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 177.84375, "completions/mean_terminated_length": 177.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4696926176548004, "epoch": 1.3627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9144773961228163, "kl": 0.06416451930999756, "learning_rate": 6.659797723072558e-07, "loss": 0.0167, "num_tokens": 35257430.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3812090158462524, "sampling/importance_sampling_ratio/mean": 1.0007481575012207, "sampling/importance_sampling_ratio/min": 0.7125915884971619, "sampling/sampling_logp_difference/max": 0.33884692192077637, "sampling/sampling_logp_difference/mean": 0.017195500433444977, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 225.984375, "completions/mean_terminated_length": 225.984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.36867815256118774, "epoch": 1.3639705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.6677750693593473, "kl": 0.0362837016582489, "learning_rate": 6.653076212316681e-07, "loss": -0.0033, "num_tokens": 35291461.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4058878421783447, "sampling/importance_sampling_ratio/mean": 0.9996570944786072, "sampling/importance_sampling_ratio/min": 0.7330043911933899, "sampling/sampling_logp_difference/max": 0.34066903591156006, "sampling/sampling_logp_difference/mean": 0.013059025630354881, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 204.609375, "completions/mean_terminated_length": 204.609375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3267783224582672, "epoch": 1.3651960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.019101102209227092, "kl": 0.02914278395473957, "learning_rate": 6.646351345920818e-07, "loss": 0.0003, "num_tokens": 35323244.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4072213172912598, "sampling/importance_sampling_ratio/mean": 1.0003342628479004, "sampling/importance_sampling_ratio/min": 0.6636192202568054, "sampling/sampling_logp_difference/max": 0.41004669666290283, "sampling/sampling_logp_difference/mean": 0.013215331360697746, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 222.109375, "completions/mean_terminated_length": 222.109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3697851896286011, "epoch": 1.366421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.027466954235500943, "kl": 0.03940945118665695, "learning_rate": 6.639623137536022e-07, "loss": 0.0004, "num_tokens": 35353187.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4167759418487549, "sampling/importance_sampling_ratio/mean": 0.9999490976333618, "sampling/importance_sampling_ratio/min": 0.7221565246582031, "sampling/sampling_logp_difference/max": 0.34838390350341797, "sampling/sampling_logp_difference/mean": 0.013105101883411407, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 197.53125, "completions/mean_terminated_length": 197.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3024361729621887, "epoch": 1.3676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.019841543706668845, "kl": 0.032502297312021255, "learning_rate": 6.63289160082013e-07, "loss": 0.0003, "num_tokens": 35381221.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5686460733413696, "sampling/importance_sampling_ratio/mean": 1.0004857778549194, "sampling/importance_sampling_ratio/min": 0.45635396242141724, "sampling/sampling_logp_difference/max": 0.7844865322113037, "sampling/sampling_logp_difference/mean": 0.013303949497640133, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 188.265625, "completions/mean_terminated_length": 188.265625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.43511879444122314, "epoch": 1.3688725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.01976911540588862, "kl": 0.03235086798667908, "learning_rate": 6.626156749437736e-07, "loss": 0.0003, "num_tokens": 35410870.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4674023389816284, "sampling/importance_sampling_ratio/mean": 0.9996806979179382, "sampling/importance_sampling_ratio/min": 0.5858292579650879, "sampling/sampling_logp_difference/max": 0.5347268581390381, "sampling/sampling_logp_difference/mean": 0.015992164611816406, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 190.765625, "completions/mean_terminated_length": 190.765625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2932688593864441, "epoch": 1.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.6935468427767087, "kl": 0.04061746224761009, "learning_rate": 6.619418597060159e-07, "loss": -0.0117, "num_tokens": 35440487.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 1.0001976490020752, "sampling/importance_sampling_ratio/min": 0.6843438148498535, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.012141115963459015, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 167.03125, "completions/mean_terminated_length": 167.03125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3596198558807373, "epoch": 1.3713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.023028208968430264, "kl": 0.035580575466156006, "learning_rate": 6.612677157365425e-07, "loss": 0.0003, "num_tokens": 35468217.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3988498449325562, "sampling/importance_sampling_ratio/mean": 1.000089168548584, "sampling/importance_sampling_ratio/min": 0.6547345519065857, "sampling/sampling_logp_difference/max": 0.42352545261383057, "sampling/sampling_logp_difference/mean": 0.014147953130304813, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 215.78125, "completions/mean_terminated_length": 215.78125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.4859886169433594, "epoch": 1.3725490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.777610592194121, "kl": 0.05148335546255112, "learning_rate": 6.605932444038228e-07, "loss": 0.0142, "num_tokens": 35498235.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.455161452293396, "sampling/importance_sampling_ratio/mean": 1.0001635551452637, "sampling/importance_sampling_ratio/min": 0.6208274364471436, "sampling/sampling_logp_difference/max": 0.4767022132873535, "sampling/sampling_logp_difference/mean": 0.017418760806322098, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 224.53125, "completions/mean_terminated_length": 224.53125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.38453611731529236, "epoch": 1.3737745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.0369864012470773, "kl": 0.042825035750865936, "learning_rate": 6.599184470769908e-07, "loss": -0.0082, "num_tokens": 35527037.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5212843418121338, "sampling/importance_sampling_ratio/mean": 0.9996752738952637, "sampling/importance_sampling_ratio/min": 0.6630414128303528, "sampling/sampling_logp_difference/max": 0.4195549488067627, "sampling/sampling_logp_difference/mean": 0.01430191844701767, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 227.296875, "completions/mean_terminated_length": 227.296875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3261297345161438, "epoch": 1.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.053828586160433615, "kl": 0.03368963301181793, "learning_rate": 6.592433251258422e-07, "loss": 0.0003, "num_tokens": 35563776.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.434326410293579, "sampling/importance_sampling_ratio/mean": 0.999552309513092, "sampling/importance_sampling_ratio/min": 0.5676478147506714, "sampling/sampling_logp_difference/max": 0.5662540197372437, "sampling/sampling_logp_difference/mean": 0.0120453592389822, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 212.828125, "completions/mean_terminated_length": 212.828125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3626338541507721, "epoch": 1.3762254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.01771876735195674, "kl": 0.03627105802297592, "learning_rate": 6.58567879920832e-07, "loss": 0.0003, "num_tokens": 35595109.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5750272274017334, "sampling/importance_sampling_ratio/mean": 1.0000033378601074, "sampling/importance_sampling_ratio/min": 0.6057386994361877, "sampling/sampling_logp_difference/max": 0.5013065338134766, "sampling/sampling_logp_difference/mean": 0.014857176691293716, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 156.8125, "completions/mean_terminated_length": 156.8125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28793010115623474, "epoch": 1.3774509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02618859955521471, "kl": 0.03437525033950806, "learning_rate": 6.578921128330714e-07, "loss": 0.0003, "num_tokens": 35619017.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5308873653411865, "sampling/importance_sampling_ratio/mean": 0.9994142055511475, "sampling/importance_sampling_ratio/min": 0.6210438013076782, "sampling/sampling_logp_difference/max": 0.47635364532470703, "sampling/sampling_logp_difference/mean": 0.012552684172987938, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 196.609375, "completions/mean_terminated_length": 196.609375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.42831873893737793, "epoch": 1.3786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018212638858156226, "kl": 0.036771561950445175, "learning_rate": 6.572160252343242e-07, "loss": 0.0003, "num_tokens": 35652368.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5762969255447388, "sampling/importance_sampling_ratio/mean": 1.0003396272659302, "sampling/importance_sampling_ratio/min": 0.6428821682929993, "sampling/sampling_logp_difference/max": 0.4550783634185791, "sampling/sampling_logp_difference/mean": 0.0155325997620821, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 173.890625, "completions/mean_terminated_length": 173.890625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.45053666830062866, "epoch": 1.3799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.025006269869773442, "kl": 0.04868462681770325, "learning_rate": 6.565396184970059e-07, "loss": 0.0005, "num_tokens": 35683641.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.283663034439087, "sampling/importance_sampling_ratio/mean": 0.9999324083328247, "sampling/importance_sampling_ratio/min": 0.6247842907905579, "sampling/sampling_logp_difference/max": 0.4703488349914551, "sampling/sampling_logp_difference/mean": 0.017172731459140778, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 222.09375, "completions/mean_terminated_length": 222.09375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3818378448486328, "epoch": 1.3811274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.014529073554556921, "kl": 0.03208887577056885, "learning_rate": 6.558628939941791e-07, "loss": 0.0003, "num_tokens": 35720255.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3865206241607666, "sampling/importance_sampling_ratio/mean": 1.0001908540725708, "sampling/importance_sampling_ratio/min": 0.6459605097770691, "sampling/sampling_logp_difference/max": 0.43701696395874023, "sampling/sampling_logp_difference/mean": 0.0143346032127738, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 239.890625, "completions/mean_terminated_length": 239.890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5276911854743958, "epoch": 1.3823529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.8103121218514556, "kl": 0.045806415379047394, "learning_rate": 6.551858530995517e-07, "loss": -0.0098, "num_tokens": 35756152.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.465988039970398, "sampling/importance_sampling_ratio/mean": 1.000345230102539, "sampling/importance_sampling_ratio/min": 0.5677291750907898, "sampling/sampling_logp_difference/max": 0.5661107301712036, "sampling/sampling_logp_difference/mean": 0.015663184225559235, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 212.484375, "completions/mean_terminated_length": 212.484375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4683011770248413, "epoch": 1.383578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01798580085049312, "kl": 0.030934587121009827, "learning_rate": 6.545084971874736e-07, "loss": 0.0003, "num_tokens": 35790439.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.290934443473816, "sampling/importance_sampling_ratio/mean": 0.9998828768730164, "sampling/importance_sampling_ratio/min": 0.6298438310623169, "sampling/sampling_logp_difference/max": 0.4622833728790283, "sampling/sampling_logp_difference/mean": 0.01621684618294239, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 236.390625, "completions/mean_terminated_length": 236.390625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3739738464355469, "epoch": 1.3848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.016134855144591998, "kl": 0.028364058583974838, "learning_rate": 6.538308276329349e-07, "loss": 0.0003, "num_tokens": 35825600.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4213730096817017, "sampling/importance_sampling_ratio/mean": 1.0001099109649658, "sampling/importance_sampling_ratio/min": 0.6817567348480225, "sampling/sampling_logp_difference/max": 0.38308238983154297, "sampling/sampling_logp_difference/mean": 0.012722737155854702, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 180.484375, "completions/mean_terminated_length": 180.484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.4150697886943817, "epoch": 1.3860294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.02778097487537663, "kl": 0.055892542004585266, "learning_rate": 6.531528458115614e-07, "loss": 0.0005, "num_tokens": 35852735.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3184069395065308, "sampling/importance_sampling_ratio/mean": 0.9997545480728149, "sampling/importance_sampling_ratio/min": 0.6922799944877625, "sampling/sampling_logp_difference/max": 0.36776483058929443, "sampling/sampling_logp_difference/mean": 0.014705037698149681, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 214.015625, "completions/mean_terminated_length": 214.015625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.38773012161254883, "epoch": 1.3872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.018687893142765078, "kl": 0.03789406269788742, "learning_rate": 6.524745530996136e-07, "loss": 0.0004, "num_tokens": 35885040.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.381638526916504, "sampling/importance_sampling_ratio/mean": 1.0002174377441406, "sampling/importance_sampling_ratio/min": 0.7143754959106445, "sampling/sampling_logp_difference/max": 0.3363466262817383, "sampling/sampling_logp_difference/mean": 0.013985749334096909, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 182.1875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3540676236152649, "epoch": 1.3884803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.01958145904747897, "kl": 0.03956735134124756, "learning_rate": 6.517959508739825e-07, "loss": 0.0004, "num_tokens": 35915516.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6641453504562378, "sampling/importance_sampling_ratio/mean": 0.9995432496070862, "sampling/importance_sampling_ratio/min": 0.6927388906478882, "sampling/sampling_logp_difference/max": 0.5093116760253906, "sampling/sampling_logp_difference/mean": 0.013443275354802608, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 238.46875, "completions/mean_terminated_length": 238.46875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3899323344230652, "epoch": 1.3897058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.014861637691700475, "kl": 0.03583185374736786, "learning_rate": 6.511170405121877e-07, "loss": 0.0003, "num_tokens": 35947722.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.627052664756775, "sampling/importance_sampling_ratio/mean": 0.9995888471603394, "sampling/importance_sampling_ratio/min": 0.7298383712768555, "sampling/sampling_logp_difference/max": 0.4867701530456543, "sampling/sampling_logp_difference/mean": 0.01395932212471962, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 167.21875, "completions/mean_terminated_length": 167.21875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3784646689891815, "epoch": 1.3909313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.8825164568974678, "kl": 0.058056920766830444, "learning_rate": 6.504378233923742e-07, "loss": 0.0037, "num_tokens": 35972984.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5136674642562866, "sampling/importance_sampling_ratio/mean": 1.000325083732605, "sampling/importance_sampling_ratio/min": 0.6269902586936951, "sampling/sampling_logp_difference/max": 0.4668242931365967, "sampling/sampling_logp_difference/mean": 0.015267467126250267, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 200.921875, "completions/mean_terminated_length": 200.921875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4866487383842468, "epoch": 1.392156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.020093671919080434, "kl": 0.044742751866579056, "learning_rate": 6.497583008933097e-07, "loss": 0.0004, "num_tokens": 36003939.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3623809814453125, "sampling/importance_sampling_ratio/mean": 0.9999260306358337, "sampling/importance_sampling_ratio/min": 0.6157806515693665, "sampling/sampling_logp_difference/max": 0.4848644733428955, "sampling/sampling_logp_difference/mean": 0.016255199909210205, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 202.6875, "completions/mean_terminated_length": 202.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.341106116771698, "epoch": 1.3933823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.018811027989333717, "kl": 0.0390816330909729, "learning_rate": 6.490784743943818e-07, "loss": 0.0004, "num_tokens": 36032239.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6553131341934204, "sampling/importance_sampling_ratio/mean": 1.0006049871444702, "sampling/importance_sampling_ratio/min": 0.7228627800941467, "sampling/sampling_logp_difference/max": 0.5039901733398438, "sampling/sampling_logp_difference/mean": 0.013343091122806072, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 203.171875, "completions/mean_terminated_length": 203.171875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4166279137134552, "epoch": 1.3946078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0201380357592192, "kl": 0.03564707189798355, "learning_rate": 6.483983452755952e-07, "loss": 0.0003, "num_tokens": 36067290.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4581257104873657, "sampling/importance_sampling_ratio/mean": 1.0003366470336914, "sampling/importance_sampling_ratio/min": 0.6117782592773438, "sampling/sampling_logp_difference/max": 0.49138545989990234, "sampling/sampling_logp_difference/mean": 0.014345650561153889, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4097669720649719, "epoch": 1.3958333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.015900443786307623, "kl": 0.03139366954565048, "learning_rate": 6.477179149175692e-07, "loss": 0.0003, "num_tokens": 36104642.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 0.9999328255653381, "sampling/importance_sampling_ratio/min": 0.6221033930778503, "sampling/sampling_logp_difference/max": 0.48689985275268555, "sampling/sampling_logp_difference/mean": 0.014695117250084877, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5011221170425415, "epoch": 1.3970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.017791330374902426, "kl": 0.04139325022697449, "learning_rate": 6.470371847015341e-07, "loss": 0.0004, "num_tokens": 36140154.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3758145570755005, "sampling/importance_sampling_ratio/mean": 1.0001449584960938, "sampling/importance_sampling_ratio/min": 0.6445356011390686, "sampling/sampling_logp_difference/max": 0.4392251968383789, "sampling/sampling_logp_difference/mean": 0.016765639185905457, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 212.125, "completions/mean_terminated_length": 212.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4664650559425354, "epoch": 1.3982843137254901, "frac_reward_zero_std": 0.75, "grad_norm": 0.871845786759249, "kl": 0.045797113329172134, "learning_rate": 6.463561560093292e-07, "loss": -0.0076, "num_tokens": 36172914.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.572105050086975, "sampling/importance_sampling_ratio/mean": 1.0001112222671509, "sampling/importance_sampling_ratio/min": 0.6616653800010681, "sampling/sampling_logp_difference/max": 0.45241546630859375, "sampling/sampling_logp_difference/mean": 0.016561444848775864, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 294.265625, "completions/mean_terminated_length": 294.265625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3774057626724243, "epoch": 1.3995098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.013212440900637555, "kl": 0.024201175197958946, "learning_rate": 6.456748302233994e-07, "loss": 0.0002, "num_tokens": 36209235.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5392427444458008, "sampling/importance_sampling_ratio/mean": 1.0007257461547852, "sampling/importance_sampling_ratio/min": 0.7199281454086304, "sampling/sampling_logp_difference/max": 0.4312906265258789, "sampling/sampling_logp_difference/mean": 0.012639081105589867, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 163.484375, "completions/mean_terminated_length": 163.484375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.4590701460838318, "epoch": 1.4007352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 1.2703788828748115, "kl": 0.050560418516397476, "learning_rate": 6.449932087267931e-07, "loss": -0.0102, "num_tokens": 36235458.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.402066946029663, "sampling/importance_sampling_ratio/mean": 1.0001840591430664, "sampling/importance_sampling_ratio/min": 0.6709317564964294, "sampling/sampling_logp_difference/max": 0.39908790588378906, "sampling/sampling_logp_difference/mean": 0.0171342883259058, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 176.359375, "completions/mean_terminated_length": 176.359375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.38662445545196533, "epoch": 1.4019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.017236828833540965, "kl": 0.037596482783555984, "learning_rate": 6.443112929031586e-07, "loss": 0.0003, "num_tokens": 36261545.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3856422901153564, "sampling/importance_sampling_ratio/mean": 0.9996615648269653, "sampling/importance_sampling_ratio/min": 0.6394821405410767, "sampling/sampling_logp_difference/max": 0.447096586227417, "sampling/sampling_logp_difference/mean": 0.014809362590312958, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.40431851148605347, "epoch": 1.403186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.021331365625703074, "kl": 0.03596138954162598, "learning_rate": 6.43629084136742e-07, "loss": 0.0003, "num_tokens": 36293065.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6057169437408447, "sampling/importance_sampling_ratio/mean": 1.0001697540283203, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.4735703468322754, "sampling/sampling_logp_difference/mean": 0.014951720833778381, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 207.734375, "completions/mean_terminated_length": 207.734375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4531485140323639, "epoch": 1.4044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.018203394563173974, "kl": 0.04105984419584274, "learning_rate": 6.429465838123838e-07, "loss": 0.0004, "num_tokens": 36323656.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6343212127685547, "sampling/importance_sampling_ratio/mean": 0.9997382760047913, "sampling/importance_sampling_ratio/min": 0.6331372857093811, "sampling/sampling_logp_difference/max": 0.4912276268005371, "sampling/sampling_logp_difference/mean": 0.01589648798108101, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 259.3125, "completions/mean_terminated_length": 259.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3934080898761749, "epoch": 1.405637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.020914130194614033, "kl": 0.035692743957042694, "learning_rate": 6.422637933155162e-07, "loss": 0.0003, "num_tokens": 36358460.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2907326221466064, "sampling/importance_sampling_ratio/mean": 0.9999151229858398, "sampling/importance_sampling_ratio/min": 0.675743579864502, "sampling/sampling_logp_difference/max": 0.39194154739379883, "sampling/sampling_logp_difference/mean": 0.014120127074420452, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 152.8125, "completions/mean_terminated_length": 152.8125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.29732319712638855, "epoch": 1.406862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.020387493413349046, "kl": 0.03376832604408264, "learning_rate": 6.41580714032161e-07, "loss": 0.0003, "num_tokens": 36381648.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5645488500595093, "sampling/importance_sampling_ratio/mean": 1.0004475116729736, "sampling/importance_sampling_ratio/min": 0.6080793738365173, "sampling/sampling_logp_difference/max": 0.4974498748779297, "sampling/sampling_logp_difference/mean": 0.012887522578239441, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 177.53125, "completions/mean_terminated_length": 177.53125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.41781967878341675, "epoch": 1.4080882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.02404443127113517, "kl": 0.04842346906661987, "learning_rate": 6.408973473489257e-07, "loss": 0.0004, "num_tokens": 36408722.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8529335260391235, "sampling/importance_sampling_ratio/mean": 0.9999231696128845, "sampling/importance_sampling_ratio/min": 0.6887311339378357, "sampling/sampling_logp_difference/max": 0.6167700290679932, "sampling/sampling_logp_difference/mean": 0.015981681644916534, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4293907880783081, "epoch": 1.409313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.02367845774403614, "kl": 0.045156046748161316, "learning_rate": 6.402136946530014e-07, "loss": 0.0004, "num_tokens": 36440946.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4513518810272217, "sampling/importance_sampling_ratio/mean": 1.0003032684326172, "sampling/importance_sampling_ratio/min": 0.6622359752655029, "sampling/sampling_logp_difference/max": 0.4121333360671997, "sampling/sampling_logp_difference/mean": 0.01704506203532219, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 224.109375, "completions/mean_terminated_length": 224.109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4653753340244293, "epoch": 1.4105392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.9583863934305666, "kl": 0.040310852229595184, "learning_rate": 6.395297573321597e-07, "loss": 0.041, "num_tokens": 36470825.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.656188726425171, "sampling/importance_sampling_ratio/mean": 1.0000224113464355, "sampling/importance_sampling_ratio/min": 0.6066967248916626, "sampling/sampling_logp_difference/max": 0.504518985748291, "sampling/sampling_logp_difference/mean": 0.01696554198861122, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 259.203125, "completions/mean_terminated_length": 259.203125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.4103810787200928, "epoch": 1.4117647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.016200855867439422, "kl": 0.027447929605841637, "learning_rate": 6.388455367747502e-07, "loss": 0.0003, "num_tokens": 36506230.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4783718585968018, "sampling/importance_sampling_ratio/mean": 1.0000253915786743, "sampling/importance_sampling_ratio/min": 0.7335873246192932, "sampling/sampling_logp_difference/max": 0.3909413814544678, "sampling/sampling_logp_difference/mean": 0.013810326345264912, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 251.21875, "completions/mean_terminated_length": 251.21875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4707592725753784, "epoch": 1.4129901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7627371249958246, "kl": 0.04826528578996658, "learning_rate": 6.38161034369697e-07, "loss": -0.0081, "num_tokens": 36539972.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.502083420753479, "sampling/importance_sampling_ratio/mean": 0.9995489716529846, "sampling/importance_sampling_ratio/min": 0.5256656408309937, "sampling/sampling_logp_difference/max": 0.6430898904800415, "sampling/sampling_logp_difference/mean": 0.015454958193004131, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 201.421875, "completions/mean_terminated_length": 201.421875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3892950415611267, "epoch": 1.4142156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.017690955857901577, "kl": 0.028931012377142906, "learning_rate": 6.37476251506497e-07, "loss": 0.0003, "num_tokens": 36568895.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4651638269424438, "sampling/importance_sampling_ratio/mean": 1.0001156330108643, "sampling/importance_sampling_ratio/min": 0.649138331413269, "sampling/sampling_logp_difference/max": 0.43210935592651367, "sampling/sampling_logp_difference/mean": 0.013972951099276543, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 202.6875, "completions/mean_terminated_length": 202.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.519242525100708, "epoch": 1.4154411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.02135539836263351, "kl": 0.04262981563806534, "learning_rate": 6.367911895752158e-07, "loss": 0.0004, "num_tokens": 36603019.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4156533479690552, "sampling/importance_sampling_ratio/mean": 0.9998533725738525, "sampling/importance_sampling_ratio/min": 0.6932653784751892, "sampling/sampling_logp_difference/max": 0.3663424253463745, "sampling/sampling_logp_difference/mean": 0.016897693276405334, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 230.453125, "completions/mean_terminated_length": 230.453125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.416151762008667, "epoch": 1.4166666666666667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0189186992301897, "kl": 0.03677485138177872, "learning_rate": 6.361058499664855e-07, "loss": 0.0004, "num_tokens": 36638200.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6617846488952637, "sampling/importance_sampling_ratio/mean": 0.9995614886283875, "sampling/importance_sampling_ratio/min": 0.6192152500152588, "sampling/sampling_logp_difference/max": 0.5078921318054199, "sampling/sampling_logp_difference/mean": 0.014120293781161308, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 197.796875, "completions/mean_terminated_length": 197.796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.47481685876846313, "epoch": 1.4178921568627452, "frac_reward_zero_std": 0.75, "grad_norm": 0.7106213381086652, "kl": 0.039433401077985764, "learning_rate": 6.354202340715026e-07, "loss": -0.001, "num_tokens": 36669771.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.373407006263733, "sampling/importance_sampling_ratio/mean": 0.999471127986908, "sampling/importance_sampling_ratio/min": 0.6230600476264954, "sampling/sampling_logp_difference/max": 0.4731123447418213, "sampling/sampling_logp_difference/mean": 0.016685422509908676, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 182.328125, "completions/mean_terminated_length": 182.328125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3988850712776184, "epoch": 1.4191176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.021343238010666843, "kl": 0.04164959117770195, "learning_rate": 6.347343432820234e-07, "loss": 0.0004, "num_tokens": 36701792.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3889849185943604, "sampling/importance_sampling_ratio/mean": 1.000058889389038, "sampling/importance_sampling_ratio/min": 0.6187108159065247, "sampling/sampling_logp_difference/max": 0.4801173210144043, "sampling/sampling_logp_difference/mean": 0.015087027102708817, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 182.046875, "completions/mean_terminated_length": 182.046875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4099763035774231, "epoch": 1.420343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.018067353414028858, "kl": 0.034441027790308, "learning_rate": 6.340481789903634e-07, "loss": 0.0003, "num_tokens": 36737171.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6580297946929932, "sampling/importance_sampling_ratio/mean": 0.9997607469558716, "sampling/importance_sampling_ratio/min": 0.7133997082710266, "sampling/sampling_logp_difference/max": 0.5056300163269043, "sampling/sampling_logp_difference/mean": 0.014864427037537098, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 200.40625, "completions/mean_terminated_length": 200.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4773097634315491, "epoch": 1.4215686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.0293788881414854, "kl": 0.0406605489552021, "learning_rate": 6.333617425893919e-07, "loss": -0.0233, "num_tokens": 36765997.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3286681175231934, "sampling/importance_sampling_ratio/mean": 1.0002083778381348, "sampling/importance_sampling_ratio/min": 0.6262644529342651, "sampling/sampling_logp_difference/max": 0.46798253059387207, "sampling/sampling_logp_difference/mean": 0.01621146872639656, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 171.65625, "completions/mean_terminated_length": 171.65625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.37978655099868774, "epoch": 1.4227941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.018114489749995775, "kl": 0.03288944438099861, "learning_rate": 6.326750354725319e-07, "loss": 0.0003, "num_tokens": 36794743.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.800256371498108, "sampling/importance_sampling_ratio/mean": 1.0006855726242065, "sampling/importance_sampling_ratio/min": 0.617500901222229, "sampling/sampling_logp_difference/max": 0.5879291296005249, "sampling/sampling_logp_difference/mean": 0.015034069307148457, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 213.265625, "completions/mean_terminated_length": 213.265625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.44157570600509644, "epoch": 1.4240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.015580433843617932, "kl": 0.026813317090272903, "learning_rate": 6.319880590337548e-07, "loss": 0.0003, "num_tokens": 36827144.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000941753387451, "sampling/importance_sampling_ratio/min": 0.6778589487075806, "sampling/sampling_logp_difference/max": 0.8838214874267578, "sampling/sampling_logp_difference/mean": 0.016190864145755768, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4329409599304199, "epoch": 1.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03626490405225596, "kl": 0.04385094344615936, "learning_rate": 6.313008146675799e-07, "loss": 0.0004, "num_tokens": 36856030.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.393547773361206, "sampling/importance_sampling_ratio/mean": 1.0001081228256226, "sampling/importance_sampling_ratio/min": 0.6501546502113342, "sampling/sampling_logp_difference/max": 0.4305450916290283, "sampling/sampling_logp_difference/mean": 0.017302222549915314, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 232.28125, "completions/mean_terminated_length": 232.28125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.33402836322784424, "epoch": 1.4264705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.012219846207602742, "kl": 0.02296830713748932, "learning_rate": 6.306133037690692e-07, "loss": 0.0002, "num_tokens": 36888976.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3758339881896973, "sampling/importance_sampling_ratio/mean": 1.0000852346420288, "sampling/importance_sampling_ratio/min": 0.6108908653259277, "sampling/sampling_logp_difference/max": 0.49283695220947266, "sampling/sampling_logp_difference/mean": 0.012221208773553371, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 189.703125, "completions/mean_terminated_length": 189.703125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3288557827472687, "epoch": 1.4276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.018641186944606498, "kl": 0.0329284593462944, "learning_rate": 6.299255277338264e-07, "loss": 0.0003, "num_tokens": 36920125.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4268949031829834, "sampling/importance_sampling_ratio/mean": 1.0001251697540283, "sampling/importance_sampling_ratio/min": 0.6372838616371155, "sampling/sampling_logp_difference/max": 0.45054006576538086, "sampling/sampling_logp_difference/mean": 0.01316780410706997, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 188.265625, "completions/mean_terminated_length": 188.265625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3560446500778198, "epoch": 1.428921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.0224097862378743, "kl": 0.0396597683429718, "learning_rate": 6.292374879579934e-07, "loss": 0.0004, "num_tokens": 36946382.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4545938968658447, "sampling/importance_sampling_ratio/mean": 1.0007069110870361, "sampling/importance_sampling_ratio/min": 0.6191086173057556, "sampling/sampling_logp_difference/max": 0.4794745445251465, "sampling/sampling_logp_difference/mean": 0.013722263276576996, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 210.453125, "completions/mean_terminated_length": 210.453125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.40921494364738464, "epoch": 1.4301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.022115861694415706, "kl": 0.03952673822641373, "learning_rate": 6.285491858382473e-07, "loss": 0.0004, "num_tokens": 36979387.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4917949438095093, "sampling/importance_sampling_ratio/mean": 1.0006740093231201, "sampling/importance_sampling_ratio/min": 0.7042884230613708, "sampling/sampling_logp_difference/max": 0.3999800682067871, "sampling/sampling_logp_difference/mean": 0.015281391330063343, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 258.984375, "completions/mean_terminated_length": 258.984375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.473724901676178, "epoch": 1.4313725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.022753461567393787, "kl": 0.04564649984240532, "learning_rate": 6.278606227717978e-07, "loss": 0.0004, "num_tokens": 37017994.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.27568781375885, "sampling/importance_sampling_ratio/mean": 0.9997217059135437, "sampling/importance_sampling_ratio/min": 0.6260144114494324, "sampling/sampling_logp_difference/max": 0.4683818817138672, "sampling/sampling_logp_difference/mean": 0.015333606861531734, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 209.078125, "completions/mean_terminated_length": 209.078125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4920414090156555, "epoch": 1.4325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01657223631128751, "kl": 0.035376884043216705, "learning_rate": 6.271718001563843e-07, "loss": 0.0003, "num_tokens": 37047711.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4162039756774902, "sampling/importance_sampling_ratio/mean": 0.9997128248214722, "sampling/importance_sampling_ratio/min": 0.6210846304893494, "sampling/sampling_logp_difference/max": 0.476287841796875, "sampling/sampling_logp_difference/mean": 0.016096480190753937, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 221.890625, "completions/mean_terminated_length": 221.890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4462301433086395, "epoch": 1.4338235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.8204554633995234, "kl": 0.04333376884460449, "learning_rate": 6.264827193902731e-07, "loss": -0.0225, "num_tokens": 37084008.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.578283667564392, "sampling/importance_sampling_ratio/mean": 1.000231146812439, "sampling/importance_sampling_ratio/min": 0.6549274921417236, "sampling/sampling_logp_difference/max": 0.45633792877197266, "sampling/sampling_logp_difference/mean": 0.01599014550447464, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 210.515625, "completions/mean_terminated_length": 210.515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.5151039958000183, "epoch": 1.4350490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.04422786233709827, "kl": 0.06160306930541992, "learning_rate": 6.257933818722542e-07, "loss": 0.0005, "num_tokens": 37117737.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3585052490234375, "sampling/importance_sampling_ratio/mean": 1.0009124279022217, "sampling/importance_sampling_ratio/min": 0.6124889850616455, "sampling/sampling_logp_difference/max": 0.49022436141967773, "sampling/sampling_logp_difference/mean": 0.018399199470877647, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 200.28125, "completions/mean_terminated_length": 200.28125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.46506404876708984, "epoch": 1.4362745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.7820850745089246, "kl": 0.03863566368818283, "learning_rate": 6.251037890016395e-07, "loss": -0.0308, "num_tokens": 37145963.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4675662517547607, "sampling/importance_sampling_ratio/mean": 1.000497817993164, "sampling/importance_sampling_ratio/min": 0.7670260667800903, "sampling/sampling_logp_difference/max": 0.38360536098480225, "sampling/sampling_logp_difference/mean": 0.016250811517238617, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 206.34375, "completions/mean_terminated_length": 206.34375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.41054677963256836, "epoch": 1.4375, "frac_reward_zero_std": 0.75, "grad_norm": 0.6938400982338712, "kl": 0.045754268765449524, "learning_rate": 6.244139421782587e-07, "loss": -0.0011, "num_tokens": 37172913.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4826430082321167, "sampling/importance_sampling_ratio/mean": 0.9999325275421143, "sampling/importance_sampling_ratio/min": 0.6914175748825073, "sampling/sampling_logp_difference/max": 0.39382636547088623, "sampling/sampling_logp_difference/mean": 0.013678722083568573, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 237.3125, "completions/mean_terminated_length": 237.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3614650070667267, "epoch": 1.4387254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.013151961876267592, "kl": 0.02268226258456707, "learning_rate": 6.237238428024571e-07, "loss": 0.0002, "num_tokens": 37207365.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5792675018310547, "sampling/importance_sampling_ratio/mean": 1.0003628730773926, "sampling/importance_sampling_ratio/min": 0.5141401886940002, "sampling/sampling_logp_difference/max": 0.6652593612670898, "sampling/sampling_logp_difference/mean": 0.014220098033547401, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 185.46875, "completions/mean_terminated_length": 185.46875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.40694963932037354, "epoch": 1.4399509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.019817491208471438, "kl": 0.04109745845198631, "learning_rate": 6.230334922750929e-07, "loss": 0.0004, "num_tokens": 37233379.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6207282543182373, "sampling/importance_sampling_ratio/mean": 1.00010347366333, "sampling/importance_sampling_ratio/min": 0.6079967617988586, "sampling/sampling_logp_difference/max": 0.4975857734680176, "sampling/sampling_logp_difference/mean": 0.015249053947627544, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 193.9375, "completions/mean_terminated_length": 193.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.34148937463760376, "epoch": 1.4411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02603340705294334, "kl": 0.030143603682518005, "learning_rate": 6.223428919975338e-07, "loss": 0.0003, "num_tokens": 37264687.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5455199480056763, "sampling/importance_sampling_ratio/mean": 1.0002055168151855, "sampling/importance_sampling_ratio/min": 0.6264569759368896, "sampling/sampling_logp_difference/max": 0.46767520904541016, "sampling/sampling_logp_difference/mean": 0.01314076129347086, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 219.09375, "completions/mean_terminated_length": 219.09375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.48918333649635315, "epoch": 1.4424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.040913893462463, "kl": 0.07333552837371826, "learning_rate": 6.216520433716544e-07, "loss": 0.0169, "num_tokens": 37296421.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.434165358543396, "sampling/importance_sampling_ratio/mean": 0.9998877644538879, "sampling/importance_sampling_ratio/min": 0.6926529407501221, "sampling/sampling_logp_difference/max": 0.367226243019104, "sampling/sampling_logp_difference/mean": 0.01609702780842781, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 206.578125, "completions/mean_terminated_length": 206.578125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.45933449268341064, "epoch": 1.4436274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.015941078259987168, "kl": 0.03184128180146217, "learning_rate": 6.209609477998338e-07, "loss": 0.0003, "num_tokens": 37330490.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6488854885101318, "sampling/importance_sampling_ratio/mean": 1.0001170635223389, "sampling/importance_sampling_ratio/min": 0.6173411011695862, "sampling/sampling_logp_difference/max": 0.5000996589660645, "sampling/sampling_logp_difference/mean": 0.015895390883088112, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 191.234375, "completions/mean_terminated_length": 191.234375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.45986810326576233, "epoch": 1.4448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.8005740570694926, "kl": 0.05514213815331459, "learning_rate": 6.202696066849524e-07, "loss": 0.0067, "num_tokens": 37357225.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6609363555908203, "sampling/importance_sampling_ratio/mean": 1.000389575958252, "sampling/importance_sampling_ratio/min": 0.6299614310264587, "sampling/sampling_logp_difference/max": 0.5073815584182739, "sampling/sampling_logp_difference/mean": 0.01626450940966606, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 198.90625, "completions/mean_terminated_length": 198.90625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4463042616844177, "epoch": 1.446078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.8882024418683567, "kl": 0.031965792179107666, "learning_rate": 6.195780214303887e-07, "loss": -0.0144, "num_tokens": 37392835.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4765907526016235, "sampling/importance_sampling_ratio/mean": 0.9997777938842773, "sampling/importance_sampling_ratio/min": 0.6732635498046875, "sampling/sampling_logp_difference/max": 0.3956184387207031, "sampling/sampling_logp_difference/mean": 0.014926549047231674, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 179.453125, "completions/mean_terminated_length": 179.453125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4128044843673706, "epoch": 1.4473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0281274341082484, "kl": 0.04153367131948471, "learning_rate": 6.188861934400171e-07, "loss": 0.0004, "num_tokens": 37427808.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6002800464630127, "sampling/importance_sampling_ratio/mean": 1.000360131263733, "sampling/importance_sampling_ratio/min": 0.6093294620513916, "sampling/sampling_logp_difference/max": 0.49539613723754883, "sampling/sampling_logp_difference/mean": 0.01555293332785368, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 224.484375, "completions/mean_terminated_length": 224.484375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4386179447174072, "epoch": 1.4485294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.015855737822550973, "kl": 0.02755020745098591, "learning_rate": 6.181941241182043e-07, "loss": 0.0003, "num_tokens": 37464927.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.8085458278656006, "sampling/importance_sampling_ratio/mean": 1.0002679824829102, "sampling/importance_sampling_ratio/min": 0.5397461652755737, "sampling/sampling_logp_difference/max": 0.6166563034057617, "sampling/sampling_logp_difference/mean": 0.01591423712670803, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.4858577251434326, "epoch": 1.4497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0355042417143942, "kl": 0.05320306867361069, "learning_rate": 6.175018148698076e-07, "loss": 0.0005, "num_tokens": 37495119.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7035918235778809, "sampling/importance_sampling_ratio/mean": 1.0005650520324707, "sampling/importance_sampling_ratio/min": 0.6632429361343384, "sampling/sampling_logp_difference/max": 0.5327389240264893, "sampling/sampling_logp_difference/mean": 0.017133308574557304, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 209.59375, "completions/mean_terminated_length": 209.59375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5416671633720398, "epoch": 1.4509803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.1585298101868597, "kl": 0.05938192084431648, "learning_rate": 6.168092671001705e-07, "loss": -0.0202, "num_tokens": 37529157.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4024341106414795, "sampling/importance_sampling_ratio/mean": 1.0000582933425903, "sampling/importance_sampling_ratio/min": 0.6091784238815308, "sampling/sampling_logp_difference/max": 0.49564409255981445, "sampling/sampling_logp_difference/mean": 0.017426788806915283, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 162.890625, "completions/mean_terminated_length": 162.890625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.45169419050216675, "epoch": 1.4522058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.02232771027440117, "kl": 0.04014962911605835, "learning_rate": 6.161164822151213e-07, "loss": 0.0004, "num_tokens": 37558126.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3685723543167114, "sampling/importance_sampling_ratio/mean": 1.000321626663208, "sampling/importance_sampling_ratio/min": 0.6659936904907227, "sampling/sampling_logp_difference/max": 0.4064750671386719, "sampling/sampling_logp_difference/mean": 0.015808694064617157, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 235.78125, "completions/mean_terminated_length": 235.78125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3496420383453369, "epoch": 1.4534313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.015577012636220793, "kl": 0.026788845658302307, "learning_rate": 6.154234616209692e-07, "loss": 0.0003, "num_tokens": 37592784.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3528342247009277, "sampling/importance_sampling_ratio/mean": 0.9999212026596069, "sampling/importance_sampling_ratio/min": 0.6132686734199524, "sampling/sampling_logp_difference/max": 0.4889521598815918, "sampling/sampling_logp_difference/mean": 0.012509230524301529, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 208.71875, "completions/mean_terminated_length": 208.71875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.46052321791648865, "epoch": 1.454656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.018013028851222145, "kl": 0.02776549756526947, "learning_rate": 6.147302067245028e-07, "loss": 0.0003, "num_tokens": 37622414.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6190518140792847, "sampling/importance_sampling_ratio/mean": 0.9995525479316711, "sampling/importance_sampling_ratio/min": 0.6144495010375977, "sampling/sampling_logp_difference/max": 0.4870285987854004, "sampling/sampling_logp_difference/mean": 0.015121417120099068, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 203.21875, "completions/mean_terminated_length": 203.21875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4830366373062134, "epoch": 1.4558823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.02214156207247184, "kl": 0.04310956969857216, "learning_rate": 6.140367189329847e-07, "loss": 0.0004, "num_tokens": 37653148.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5278550386428833, "sampling/importance_sampling_ratio/mean": 0.9998105764389038, "sampling/importance_sampling_ratio/min": 0.7727299928665161, "sampling/sampling_logp_difference/max": 0.42386484146118164, "sampling/sampling_logp_difference/mean": 0.01657957024872303, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 186.40625, "completions/mean_terminated_length": 186.40625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4567105770111084, "epoch": 1.4571078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.025514641483272565, "kl": 0.04340451955795288, "learning_rate": 6.133429996541518e-07, "loss": 0.0004, "num_tokens": 37683542.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5277611017227173, "sampling/importance_sampling_ratio/mean": 0.9995805025100708, "sampling/importance_sampling_ratio/min": 0.7091161608695984, "sampling/sampling_logp_difference/max": 0.42380332946777344, "sampling/sampling_logp_difference/mean": 0.015402511693537235, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 205.140625, "completions/mean_terminated_length": 205.140625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.47519806027412415, "epoch": 1.4583333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.021531896020683963, "kl": 0.04695788025856018, "learning_rate": 6.1264905029621e-07, "loss": 0.0004, "num_tokens": 37718639.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.41250479221344, "sampling/importance_sampling_ratio/mean": 1.000076413154602, "sampling/importance_sampling_ratio/min": 0.6857278347015381, "sampling/sampling_logp_difference/max": 0.3772745132446289, "sampling/sampling_logp_difference/mean": 0.015646200627088547, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.43114298582077026, "epoch": 1.4595588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 1.1324860528711949, "kl": 0.05779781565070152, "learning_rate": 6.119548722678327e-07, "loss": 0.075, "num_tokens": 37752419.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4694956541061401, "sampling/importance_sampling_ratio/mean": 0.9997105598449707, "sampling/importance_sampling_ratio/min": 0.7481796741485596, "sampling/sampling_logp_difference/max": 0.3849191665649414, "sampling/sampling_logp_difference/mean": 0.014590677805244923, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 219.671875, "completions/mean_terminated_length": 219.671875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.36435285210609436, "epoch": 1.4607843137254901, "frac_reward_zero_std": 1.0, "grad_norm": 0.016175287540601237, "kl": 0.033833034336566925, "learning_rate": 6.112604669781572e-07, "loss": 0.0003, "num_tokens": 37783918.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3206326961517334, "sampling/importance_sampling_ratio/mean": 0.9999261498451233, "sampling/importance_sampling_ratio/min": 0.6813300251960754, "sampling/sampling_logp_difference/max": 0.38370847702026367, "sampling/sampling_logp_difference/mean": 0.012360158376395702, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 225.46875, "completions/mean_terminated_length": 225.46875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.49749669432640076, "epoch": 1.4620098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7798713022462089, "kl": 0.04543078690767288, "learning_rate": 6.105658358367822e-07, "loss": -0.0268, "num_tokens": 37817436.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3153043985366821, "sampling/importance_sampling_ratio/mean": 1.0001451969146729, "sampling/importance_sampling_ratio/min": 0.7269885540008545, "sampling/sampling_logp_difference/max": 0.3188445568084717, "sampling/sampling_logp_difference/mean": 0.01567385531961918, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 187.265625, "completions/mean_terminated_length": 187.265625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3612132966518402, "epoch": 1.4632352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.02322135424877072, "kl": 0.03864223137497902, "learning_rate": 6.098709802537653e-07, "loss": 0.0004, "num_tokens": 37842733.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4756741523742676, "sampling/importance_sampling_ratio/mean": 1.0002970695495605, "sampling/importance_sampling_ratio/min": 0.6045997738838196, "sampling/sampling_logp_difference/max": 0.5031886100769043, "sampling/sampling_logp_difference/mean": 0.015087596140801907, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 215.828125, "completions/mean_terminated_length": 215.828125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.41238224506378174, "epoch": 1.4644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.018129618192082097, "kl": 0.03457965701818466, "learning_rate": 6.091759016396188e-07, "loss": 0.0003, "num_tokens": 37873906.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4744254350662231, "sampling/importance_sampling_ratio/mean": 1.0000766515731812, "sampling/importance_sampling_ratio/min": 0.7046488523483276, "sampling/sampling_logp_difference/max": 0.3882683515548706, "sampling/sampling_logp_difference/mean": 0.014175968244671822, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 171.96875, "completions/mean_terminated_length": 171.96875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.44503602385520935, "epoch": 1.465686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.841634051797968, "kl": 0.05202612280845642, "learning_rate": 6.084806014053086e-07, "loss": 0.0064, "num_tokens": 37900976.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4928193092346191, "sampling/importance_sampling_ratio/mean": 0.9997214078903198, "sampling/importance_sampling_ratio/min": 0.5696046352386475, "sampling/sampling_logp_difference/max": 0.5628128051757812, "sampling/sampling_logp_difference/mean": 0.015186592936515808, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 207.109375, "completions/mean_terminated_length": 207.109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.31114035844802856, "epoch": 1.4669117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.02005669378796282, "kl": 0.033173758536577225, "learning_rate": 6.077850809622498e-07, "loss": 0.0003, "num_tokens": 37931767.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2876096963882446, "sampling/importance_sampling_ratio/mean": 0.9997863173484802, "sampling/importance_sampling_ratio/min": 0.6910658478736877, "sampling/sampling_logp_difference/max": 0.3695201873779297, "sampling/sampling_logp_difference/mean": 0.011576864868402481, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 166.953125, "completions/mean_terminated_length": 166.953125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5329830050468445, "epoch": 1.468137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.2504402916643644, "kl": 0.09681343287229538, "learning_rate": 6.070893417223052e-07, "loss": -0.017, "num_tokens": 37956324.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.568365216255188, "sampling/importance_sampling_ratio/mean": 0.9999660849571228, "sampling/importance_sampling_ratio/min": 0.6802662014961243, "sampling/sampling_logp_difference/max": 0.45003390312194824, "sampling/sampling_logp_difference/mean": 0.016995804384350777, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 226.265625, "completions/mean_terminated_length": 226.265625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.43442896008491516, "epoch": 1.469362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.1722368189735048, "kl": 0.04702545702457428, "learning_rate": 6.06393385097781e-07, "loss": -0.0327, "num_tokens": 37990965.0, "reward": 0.0625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.527762532234192, "sampling/importance_sampling_ratio/mean": 1.0005501508712769, "sampling/importance_sampling_ratio/min": 0.6057447791099548, "sampling/sampling_logp_difference/max": 0.5012965202331543, "sampling/sampling_logp_difference/mean": 0.013631860725581646, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3700704276561737, "epoch": 1.4705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0374565572533946, "kl": 0.04746583104133606, "learning_rate": 6.056972125014254e-07, "loss": 0.0005, "num_tokens": 38019325.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4623384475708008, "sampling/importance_sampling_ratio/mean": 1.0001246929168701, "sampling/importance_sampling_ratio/min": 0.48418575525283813, "sampling/sampling_logp_difference/max": 0.7252867221832275, "sampling/sampling_logp_difference/mean": 0.01417962834239006, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 229.296875, "completions/mean_terminated_length": 229.296875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.47815942764282227, "epoch": 1.471813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7273693398303055, "kl": 0.047664448618888855, "learning_rate": 6.050008253464246e-07, "loss": -0.05, "num_tokens": 38052688.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.627287745475769, "sampling/importance_sampling_ratio/mean": 0.9992881417274475, "sampling/importance_sampling_ratio/min": 0.6407450437545776, "sampling/sampling_logp_difference/max": 0.48691463470458984, "sampling/sampling_logp_difference/mean": 0.015799298882484436, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 196.28125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.5211617946624756, "epoch": 1.4730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.023797699100773104, "kl": 0.0537765696644783, "learning_rate": 6.043042250464004e-07, "loss": 0.0005, "num_tokens": 38087762.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5172595977783203, "sampling/importance_sampling_ratio/mean": 0.9997826814651489, "sampling/importance_sampling_ratio/min": 0.731545090675354, "sampling/sampling_logp_difference/max": 0.41690587997436523, "sampling/sampling_logp_difference/mean": 0.016150303184986115, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 176.109375, "completions/mean_terminated_length": 176.109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.5632998943328857, "epoch": 1.4742647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.8665966289860008, "kl": 0.12040524184703827, "learning_rate": 6.036074130154071e-07, "loss": 0.0063, "num_tokens": 38115785.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4598504304885864, "sampling/importance_sampling_ratio/mean": 1.000475287437439, "sampling/importance_sampling_ratio/min": 0.684594452381134, "sampling/sampling_logp_difference/max": 0.37892866134643555, "sampling/sampling_logp_difference/mean": 0.01630369946360588, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 213.109375, "completions/mean_terminated_length": 213.109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3936324119567871, "epoch": 1.4754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.018966758991156338, "kl": 0.03494948893785477, "learning_rate": 6.029103906679293e-07, "loss": 0.0003, "num_tokens": 38146064.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2813870906829834, "sampling/importance_sampling_ratio/mean": 1.0001485347747803, "sampling/importance_sampling_ratio/min": 0.7350302338600159, "sampling/sampling_logp_difference/max": 0.3078436851501465, "sampling/sampling_logp_difference/mean": 0.0130241010338068, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 190.40625, "completions/mean_terminated_length": 190.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.48241475224494934, "epoch": 1.4767156862745099, "frac_reward_zero_std": 1.0, "grad_norm": 0.02191195895971036, "kl": 0.042058248072862625, "learning_rate": 6.022131594188777e-07, "loss": 0.0004, "num_tokens": 38182682.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5201222896575928, "sampling/importance_sampling_ratio/mean": 0.9998643398284912, "sampling/importance_sampling_ratio/min": 0.7046812176704407, "sampling/sampling_logp_difference/max": 0.4187908172607422, "sampling/sampling_logp_difference/mean": 0.015262596309185028, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 188.109375, "completions/mean_terminated_length": 188.109375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.5057660937309265, "epoch": 1.4779411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.027054224214642, "kl": 0.056050047278404236, "learning_rate": 6.01515720683588e-07, "loss": 0.0005, "num_tokens": 38208337.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.605932593345642, "sampling/importance_sampling_ratio/mean": 0.9995064735412598, "sampling/importance_sampling_ratio/min": 0.6954973936080933, "sampling/sampling_logp_difference/max": 0.47370457649230957, "sampling/sampling_logp_difference/mean": 0.017247628420591354, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 181.390625, "completions/mean_terminated_length": 181.390625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4926803708076477, "epoch": 1.4791666666666667, "frac_reward_zero_std": 0.75, "grad_norm": 0.7028311180043693, "kl": 0.06302222609519958, "learning_rate": 6.008180758778166e-07, "loss": -0.0068, "num_tokens": 38237578.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.2768861055374146, "sampling/importance_sampling_ratio/mean": 0.9997392892837524, "sampling/importance_sampling_ratio/min": 0.6850432753562927, "sampling/sampling_logp_difference/max": 0.37827324867248535, "sampling/sampling_logp_difference/mean": 0.016209810972213745, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 196.796875, "completions/mean_terminated_length": 196.796875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.5411112904548645, "epoch": 1.4803921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.02817009766619767, "kl": 0.060022253543138504, "learning_rate": 6.001202264177382e-07, "loss": 0.0006, "num_tokens": 38269341.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6009783744812012, "sampling/importance_sampling_ratio/mean": 0.9997267127037048, "sampling/importance_sampling_ratio/min": 0.650458037853241, "sampling/sampling_logp_difference/max": 0.4706149101257324, "sampling/sampling_logp_difference/mean": 0.016653675585985184, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 217.90625, "completions/mean_terminated_length": 217.90625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.45477426052093506, "epoch": 1.4816176470588236, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109452365953674, "kl": 0.038713470101356506, "learning_rate": 5.99422173719943e-07, "loss": 0.0004, "num_tokens": 38300023.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8954544067382812, "sampling/importance_sampling_ratio/mean": 0.9999589920043945, "sampling/importance_sampling_ratio/min": 0.6483321189880371, "sampling/sampling_logp_difference/max": 0.6394586563110352, "sampling/sampling_logp_difference/mean": 0.01450974028557539, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 185.15625, "completions/mean_terminated_length": 185.15625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5922730565071106, "epoch": 1.482843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.027701588601640254, "kl": 0.07016173005104065, "learning_rate": 5.987239192014335e-07, "loss": 0.0007, "num_tokens": 38333857.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.542392611503601, "sampling/importance_sampling_ratio/mean": 1.0000760555267334, "sampling/importance_sampling_ratio/min": 0.5946744084358215, "sampling/sampling_logp_difference/max": 0.5197412967681885, "sampling/sampling_logp_difference/mean": 0.017837852239608765, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 185.6875, "completions/mean_terminated_length": 185.6875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.6171554327011108, "epoch": 1.4840686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.91756634392374, "kl": 0.10552050173282623, "learning_rate": 5.980254642796226e-07, "loss": 0.0181, "num_tokens": 38362701.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.287684679031372, "sampling/importance_sampling_ratio/mean": 0.9996305704116821, "sampling/importance_sampling_ratio/min": 0.6957054138183594, "sampling/sampling_logp_difference/max": 0.36282896995544434, "sampling/sampling_logp_difference/mean": 0.017641430720686913, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4842797815799713, "epoch": 1.4852941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.828278066243578, "kl": 0.059575796127319336, "learning_rate": 5.973268103723293e-07, "loss": -0.017, "num_tokens": 38388553.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3947969675064087, "sampling/importance_sampling_ratio/mean": 1.000115156173706, "sampling/importance_sampling_ratio/min": 0.6172971725463867, "sampling/sampling_logp_difference/max": 0.4824047088623047, "sampling/sampling_logp_difference/mean": 0.015291726216673851, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 243.53125, "completions/mean_terminated_length": 243.53125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5568035244941711, "epoch": 1.4865196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.9233342733029263, "kl": 0.05577544867992401, "learning_rate": 5.966279588977766e-07, "loss": 0.001, "num_tokens": 38424603.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.2761130332946777, "sampling/importance_sampling_ratio/mean": 1.000002384185791, "sampling/importance_sampling_ratio/min": 0.7563543319702148, "sampling/sampling_logp_difference/max": 0.27924537658691406, "sampling/sampling_logp_difference/mean": 0.015990786254405975, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 198.078125, "completions/mean_terminated_length": 198.078125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4018840193748474, "epoch": 1.4877450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8715222833589117, "kl": 0.0444413498044014, "learning_rate": 5.959289112745891e-07, "loss": -0.0033, "num_tokens": 38455408.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5815471410751343, "sampling/importance_sampling_ratio/mean": 1.0003597736358643, "sampling/importance_sampling_ratio/min": 0.6324144005775452, "sampling/sampling_logp_difference/max": 0.4584035873413086, "sampling/sampling_logp_difference/mean": 0.013145819306373596, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 189.5625, "completions/mean_terminated_length": 189.5625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4872584939002991, "epoch": 1.4889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.6873065312042486, "kl": 0.07594792544841766, "learning_rate": 5.952296689217889e-07, "loss": -0.0159, "num_tokens": 38483396.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.55042564868927, "sampling/importance_sampling_ratio/mean": 1.0008623600006104, "sampling/importance_sampling_ratio/min": 0.6311050057411194, "sampling/sampling_logp_difference/max": 0.4602830410003662, "sampling/sampling_logp_difference/mean": 0.01651753857731819, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 172.921875, "completions/mean_terminated_length": 172.921875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.39633241295814514, "epoch": 1.4901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.02532607640777845, "kl": 0.04810824990272522, "learning_rate": 5.945302332587938e-07, "loss": 0.0004, "num_tokens": 38511055.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4757611751556396, "sampling/importance_sampling_ratio/mean": 0.999756395816803, "sampling/importance_sampling_ratio/min": 0.6455199718475342, "sampling/sampling_logp_difference/max": 0.43769919872283936, "sampling/sampling_logp_difference/mean": 0.014180956408381462, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 228.609375, "completions/mean_terminated_length": 228.609375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.450186163187027, "epoch": 1.491421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.8566550667556685, "kl": 0.05234798416495323, "learning_rate": 5.938306057054138e-07, "loss": 0.0115, "num_tokens": 38542038.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2922401428222656, "sampling/importance_sampling_ratio/mean": 0.9993905425071716, "sampling/importance_sampling_ratio/min": 0.7026405930519104, "sampling/sampling_logp_difference/max": 0.35290980339050293, "sampling/sampling_logp_difference/mean": 0.013949232175946236, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 244.828125, "completions/mean_terminated_length": 244.828125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.37430262565612793, "epoch": 1.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0171551854812675, "kl": 0.03328785300254822, "learning_rate": 5.931307876818487e-07, "loss": 0.0003, "num_tokens": 38576619.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5551716089248657, "sampling/importance_sampling_ratio/mean": 0.9995331764221191, "sampling/importance_sampling_ratio/min": 0.6900882124900818, "sampling/sampling_logp_difference/max": 0.441585898399353, "sampling/sampling_logp_difference/mean": 0.012714147567749023, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 182.484375, "completions/mean_terminated_length": 182.484375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.542378306388855, "epoch": 1.4938725490196079, "frac_reward_zero_std": 0.25, "grad_norm": 1.5182456389303929, "kl": 0.07905251532793045, "learning_rate": 5.924307806086843e-07, "loss": 0.054, "num_tokens": 38604570.0, "reward": 0.5, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5722545385360718, "sampling/importance_sampling_ratio/mean": 1.0004451274871826, "sampling/importance_sampling_ratio/min": 0.6802780628204346, "sampling/sampling_logp_difference/max": 0.4525105953216553, "sampling/sampling_logp_difference/mean": 0.01747533492743969, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 182.28125, "completions/mean_terminated_length": 182.28125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4274052381515503, "epoch": 1.4950980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.066348521021786, "kl": 0.04405607283115387, "learning_rate": 5.917305859068911e-07, "loss": 0.0129, "num_tokens": 38633372.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.47055184841156, "sampling/importance_sampling_ratio/mean": 0.999843955039978, "sampling/importance_sampling_ratio/min": 0.7588876485824585, "sampling/sampling_logp_difference/max": 0.3856377601623535, "sampling/sampling_logp_difference/mean": 0.014215231873095036, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 218.109375, "completions/mean_terminated_length": 218.109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3771829903125763, "epoch": 1.4963235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7445440441300208, "kl": 0.040690891444683075, "learning_rate": 5.910302049978199e-07, "loss": 0.0228, "num_tokens": 38664627.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4985706806182861, "sampling/importance_sampling_ratio/mean": 0.9998940825462341, "sampling/importance_sampling_ratio/min": 0.6546953320503235, "sampling/sampling_logp_difference/max": 0.42358529567718506, "sampling/sampling_logp_difference/mean": 0.012551363557577133, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 158.46875, "completions/mean_terminated_length": 158.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.5464028120040894, "epoch": 1.4975490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.9294205998653842, "kl": 0.07251539826393127, "learning_rate": 5.903296393031995e-07, "loss": 0.0013, "num_tokens": 38692945.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2881121635437012, "sampling/importance_sampling_ratio/mean": 1.0003340244293213, "sampling/importance_sampling_ratio/min": 0.626869797706604, "sampling/sampling_logp_difference/max": 0.46701645851135254, "sampling/sampling_logp_difference/mean": 0.017922550439834595, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 203.46875, "completions/mean_terminated_length": 203.46875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.45503854751586914, "epoch": 1.4987745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0190737118625132, "kl": 0.03507302328944206, "learning_rate": 5.896288902451338e-07, "loss": 0.0003, "num_tokens": 38728687.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.930330753326416, "sampling/importance_sampling_ratio/mean": 0.9999499917030334, "sampling/importance_sampling_ratio/min": 0.6622678637504578, "sampling/sampling_logp_difference/max": 0.6576913595199585, "sampling/sampling_logp_difference/mean": 0.01384247001260519, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 174.015625, "completions/mean_terminated_length": 174.015625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3848019242286682, "epoch": 1.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.8979730657892885, "kl": 0.05828709900379181, "learning_rate": 5.88927959246099e-07, "loss": 0.0029, "num_tokens": 38755856.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4173165559768677, "sampling/importance_sampling_ratio/mean": 0.999789297580719, "sampling/importance_sampling_ratio/min": 0.36534881591796875, "sampling/sampling_logp_difference/max": 1.0069026947021484, "sampling/sampling_logp_difference/mean": 0.014410626143217087, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 179.328125, "completions/mean_terminated_length": 179.328125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.5048831105232239, "epoch": 1.5012254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.9254894215425021, "kl": 0.06378062814474106, "learning_rate": 5.882268477289408e-07, "loss": 0.029, "num_tokens": 38788613.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6207302808761597, "sampling/importance_sampling_ratio/mean": 1.0001567602157593, "sampling/importance_sampling_ratio/min": 0.6898323893547058, "sampling/sampling_logp_difference/max": 0.4828767776489258, "sampling/sampling_logp_difference/mean": 0.015225782990455627, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.47566431760787964, "epoch": 1.5024509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02140793104362899, "kl": 0.035288769751787186, "learning_rate": 5.875255571168709e-07, "loss": 0.0004, "num_tokens": 38817165.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4393155574798584, "sampling/importance_sampling_ratio/mean": 1.0006489753723145, "sampling/importance_sampling_ratio/min": 0.6677610874176025, "sampling/sampling_logp_difference/max": 0.4038248062133789, "sampling/sampling_logp_difference/mean": 0.016018467023968697, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 172.203125, "completions/mean_terminated_length": 172.203125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.42229175567626953, "epoch": 1.5036764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.023800846567112507, "kl": 0.048726920038461685, "learning_rate": 5.868240888334652e-07, "loss": 0.0005, "num_tokens": 38844458.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.283900499343872, "sampling/importance_sampling_ratio/mean": 0.9996721744537354, "sampling/importance_sampling_ratio/min": 0.6884435415267944, "sampling/sampling_logp_difference/max": 0.3733220100402832, "sampling/sampling_logp_difference/mean": 0.01524503342807293, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 220.796875, "completions/mean_terminated_length": 220.796875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.5032088160514832, "epoch": 1.5049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.021647534745950717, "kl": 0.05585585534572601, "learning_rate": 5.861224443026595e-07, "loss": 0.0005, "num_tokens": 38878813.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3067570924758911, "sampling/importance_sampling_ratio/mean": 0.9995409250259399, "sampling/importance_sampling_ratio/min": 0.703955352306366, "sampling/sampling_logp_difference/max": 0.3510403633117676, "sampling/sampling_logp_difference/mean": 0.015978462994098663, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 183.90625, "completions/mean_terminated_length": 183.90625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4661997854709625, "epoch": 1.5061274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.7744045584456632, "kl": 0.06239331141114235, "learning_rate": 5.854206249487478e-07, "loss": -0.0115, "num_tokens": 38905527.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5400595664978027, "sampling/importance_sampling_ratio/mean": 1.0004256963729858, "sampling/importance_sampling_ratio/min": 0.6938456892967224, "sampling/sampling_logp_difference/max": 0.4318211078643799, "sampling/sampling_logp_difference/mean": 0.015041721984744072, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 197.390625, "completions/mean_terminated_length": 197.390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3444875180721283, "epoch": 1.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.02053351113351947, "kl": 0.036030255258083344, "learning_rate": 5.847186321963792e-07, "loss": 0.0003, "num_tokens": 38936752.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2809301614761353, "sampling/importance_sampling_ratio/mean": 0.9998122453689575, "sampling/importance_sampling_ratio/min": 0.6104072332382202, "sampling/sampling_logp_difference/max": 0.49362897872924805, "sampling/sampling_logp_difference/mean": 0.012584058567881584, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 196.203125, "completions/mean_terminated_length": 196.203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4444473087787628, "epoch": 1.508578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.025489591186659628, "kl": 0.035917483270168304, "learning_rate": 5.840164674705542e-07, "loss": 0.0003, "num_tokens": 38967997.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4047203063964844, "sampling/importance_sampling_ratio/mean": 0.999308705329895, "sampling/importance_sampling_ratio/min": 0.5704594850540161, "sampling/sampling_logp_difference/max": 0.5613131523132324, "sampling/sampling_logp_difference/mean": 0.01690061390399933, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 185.21875, "completions/mean_terminated_length": 185.21875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.39085036516189575, "epoch": 1.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02106353212827986, "kl": 0.03460868448019028, "learning_rate": 5.833141321966228e-07, "loss": 0.0003, "num_tokens": 39000619.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5278422832489014, "sampling/importance_sampling_ratio/mean": 0.9994332790374756, "sampling/importance_sampling_ratio/min": 0.6895950436592102, "sampling/sampling_logp_difference/max": 0.4238564968109131, "sampling/sampling_logp_difference/mean": 0.014932571910321712, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 164.390625, "completions/mean_terminated_length": 164.390625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3618762493133545, "epoch": 1.5110294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.020524870792948353, "kl": 0.03939305990934372, "learning_rate": 5.826116278002813e-07, "loss": 0.0004, "num_tokens": 39025684.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.434867262840271, "sampling/importance_sampling_ratio/mean": 0.9991062879562378, "sampling/importance_sampling_ratio/min": 0.6994453072547913, "sampling/sampling_logp_difference/max": 0.361072301864624, "sampling/sampling_logp_difference/mean": 0.014167373068630695, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 211.9375, "completions/mean_terminated_length": 211.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4231787621974945, "epoch": 1.5122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.7374625059211084, "kl": 0.04008599370718002, "learning_rate": 5.819089557075688e-07, "loss": -0.0103, "num_tokens": 39057216.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5467814207077026, "sampling/importance_sampling_ratio/mean": 1.0004159212112427, "sampling/importance_sampling_ratio/min": 0.6743151545524597, "sampling/sampling_logp_difference/max": 0.4361763000488281, "sampling/sampling_logp_difference/mean": 0.015356146730482578, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 197.40625, "completions/mean_terminated_length": 197.40625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.39275944232940674, "epoch": 1.5134803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7440143713536784, "kl": 0.054746758192777634, "learning_rate": 5.812061173448654e-07, "loss": -0.0005, "num_tokens": 39090490.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8623653650283813, "sampling/importance_sampling_ratio/mean": 0.9998970627784729, "sampling/importance_sampling_ratio/min": 0.6154475808143616, "sampling/sampling_logp_difference/max": 0.62184739112854, "sampling/sampling_logp_difference/mean": 0.015792280435562134, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 204.71875, "completions/mean_terminated_length": 204.71875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.40649598836898804, "epoch": 1.5147058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.9315226544513092, "kl": 0.03523103892803192, "learning_rate": 5.805031141388883e-07, "loss": 0.0305, "num_tokens": 39124760.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.438776969909668, "sampling/importance_sampling_ratio/mean": 1.0000498294830322, "sampling/importance_sampling_ratio/min": 0.6908988356590271, "sampling/sampling_logp_difference/max": 0.36976194381713867, "sampling/sampling_logp_difference/mean": 0.014396263286471367, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3641846776008606, "epoch": 1.5159313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.7727836115847848, "kl": 0.04704434424638748, "learning_rate": 5.797999475166896e-07, "loss": -0.0046, "num_tokens": 39168782.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5360101461410522, "sampling/importance_sampling_ratio/mean": 0.9998215436935425, "sampling/importance_sampling_ratio/min": 0.7178670763969421, "sampling/sampling_logp_difference/max": 0.42918825149536133, "sampling/sampling_logp_difference/mean": 0.012815626338124275, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 194.203125, "completions/mean_terminated_length": 194.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4459923505783081, "epoch": 1.517156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7709668361144417, "kl": 0.05876849964261055, "learning_rate": 5.790966189056529e-07, "loss": -0.0015, "num_tokens": 39200987.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5467361211776733, "sampling/importance_sampling_ratio/mean": 0.9998927116394043, "sampling/importance_sampling_ratio/min": 0.6045164465904236, "sampling/sampling_logp_difference/max": 0.503326416015625, "sampling/sampling_logp_difference/mean": 0.01612141728401184, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 213.5625, "completions/mean_terminated_length": 213.5625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.468833863735199, "epoch": 1.5183823529411766, "frac_reward_zero_std": 0.25, "grad_norm": 1.3745813326523788, "kl": 0.043368056416511536, "learning_rate": 5.783931297334907e-07, "loss": 0.0629, "num_tokens": 39237087.0, "reward": 0.8125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4521344900131226, "sampling/importance_sampling_ratio/mean": 0.9998012185096741, "sampling/importance_sampling_ratio/min": 0.6323934197425842, "sampling/sampling_logp_difference/max": 0.45824360847473145, "sampling/sampling_logp_difference/mean": 0.015378328040242195, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 215.53125, "completions/mean_terminated_length": 215.53125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.47365593910217285, "epoch": 1.5196078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.02133304870081858, "kl": 0.055505771189928055, "learning_rate": 5.776894814282415e-07, "loss": 0.0004, "num_tokens": 39268145.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4176025390625, "sampling/importance_sampling_ratio/mean": 0.9997040033340454, "sampling/importance_sampling_ratio/min": 0.689681887626648, "sampling/sampling_logp_difference/max": 0.3715248107910156, "sampling/sampling_logp_difference/mean": 0.016383890062570572, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 177.15625, "completions/mean_terminated_length": 177.15625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.44920074939727783, "epoch": 1.5208333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.02596466359665803, "kl": 0.036328621208667755, "learning_rate": 5.769856754182667e-07, "loss": 0.0004, "num_tokens": 39298299.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4447506666183472, "sampling/importance_sampling_ratio/mean": 0.9996228218078613, "sampling/importance_sampling_ratio/min": 0.6075159907341003, "sampling/sampling_logp_difference/max": 0.49837684631347656, "sampling/sampling_logp_difference/mean": 0.016213443130254745, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 177.40625, "completions/mean_terminated_length": 177.40625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.406511127948761, "epoch": 1.5220588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.935093556595975, "kl": 0.03980601578950882, "learning_rate": 5.762817131322481e-07, "loss": 0.0056, "num_tokens": 39326789.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.405997633934021, "sampling/importance_sampling_ratio/mean": 1.0002541542053223, "sampling/importance_sampling_ratio/min": 0.6896554231643677, "sampling/sampling_logp_difference/max": 0.371563196182251, "sampling/sampling_logp_difference/mean": 0.016482625156641006, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 181.84375, "completions/mean_terminated_length": 181.84375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.41599082946777344, "epoch": 1.5232843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7021961826808495, "kl": 0.04142145812511444, "learning_rate": 5.755775959991844e-07, "loss": -0.0041, "num_tokens": 39356171.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2728204727172852, "sampling/importance_sampling_ratio/mean": 0.9999167323112488, "sampling/importance_sampling_ratio/min": 0.6699854135513306, "sampling/sampling_logp_difference/max": 0.4004993438720703, "sampling/sampling_logp_difference/mean": 0.014757356606423855, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 142.78125, "completions/mean_terminated_length": 142.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4153823256492615, "epoch": 1.5245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.03757600758983537, "kl": 0.0534350611269474, "learning_rate": 5.74873325448389e-07, "loss": 0.0005, "num_tokens": 39380349.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5157074928283691, "sampling/importance_sampling_ratio/mean": 1.0008015632629395, "sampling/importance_sampling_ratio/min": 0.6309089660644531, "sampling/sampling_logp_difference/max": 0.46059370040893555, "sampling/sampling_logp_difference/mean": 0.01728636398911476, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 162.4375, "completions/mean_terminated_length": 162.4375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.38047099113464355, "epoch": 1.5257352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.027273282927566717, "kl": 0.03636075556278229, "learning_rate": 5.741689029094861e-07, "loss": 0.0003, "num_tokens": 39407177.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.600734829902649, "sampling/importance_sampling_ratio/mean": 1.000301480293274, "sampling/importance_sampling_ratio/min": 0.6788877248764038, "sampling/sampling_logp_difference/max": 0.4704627990722656, "sampling/sampling_logp_difference/mean": 0.014349868521094322, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 185.3125, "completions/mean_terminated_length": 185.3125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3892589211463928, "epoch": 1.5269607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.9945046565892149, "kl": 0.05229416489601135, "learning_rate": 5.73464329812409e-07, "loss": -0.0324, "num_tokens": 39433757.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.7655868530273438, "sampling/importance_sampling_ratio/mean": 1.0002690553665161, "sampling/importance_sampling_ratio/min": 0.7580675482749939, "sampling/sampling_logp_difference/max": 0.5684831142425537, "sampling/sampling_logp_difference/mean": 0.015093202702701092, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3466736078262329, "epoch": 1.528186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.8782710940419933, "kl": 0.04014609381556511, "learning_rate": 5.727596075873965e-07, "loss": 0.0098, "num_tokens": 39458949.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4058247804641724, "sampling/importance_sampling_ratio/mean": 1.0001182556152344, "sampling/importance_sampling_ratio/min": 0.6978297829627991, "sampling/sampling_logp_difference/max": 0.35978007316589355, "sampling/sampling_logp_difference/mean": 0.013260153122246265, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 148.21875, "completions/mean_terminated_length": 148.21875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.41673874855041504, "epoch": 1.5294117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.115362662571301, "kl": 0.07008904218673706, "learning_rate": 5.7205473766499e-07, "loss": -0.0083, "num_tokens": 39488035.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.582716941833496, "sampling/importance_sampling_ratio/mean": 1.0004963874816895, "sampling/importance_sampling_ratio/min": 0.4275853931903839, "sampling/sampling_logp_difference/max": 0.8496012687683105, "sampling/sampling_logp_difference/mean": 0.017482828348875046, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 157.703125, "completions/mean_terminated_length": 157.703125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.42711418867111206, "epoch": 1.530637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.8976514209124797, "kl": 0.06900682300329208, "learning_rate": 5.71349721476031e-07, "loss": 0.0061, "num_tokens": 39518224.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6055638790130615, "sampling/importance_sampling_ratio/mean": 1.0001951456069946, "sampling/importance_sampling_ratio/min": 0.6225906610488892, "sampling/sampling_logp_difference/max": 0.47386598587036133, "sampling/sampling_logp_difference/mean": 0.015635859221220016, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4811854362487793, "epoch": 1.531862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.023626790688363004, "kl": 0.04002731293439865, "learning_rate": 5.706445604516574e-07, "loss": 0.0004, "num_tokens": 39561032.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3421891927719116, "sampling/importance_sampling_ratio/mean": 0.9997655153274536, "sampling/importance_sampling_ratio/min": 0.6639184355735779, "sampling/sampling_logp_difference/max": 0.40959596633911133, "sampling/sampling_logp_difference/mean": 0.01641765981912613, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 164.359375, "completions/mean_terminated_length": 164.359375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.4115786552429199, "epoch": 1.5330882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.9757896386048582, "kl": 0.08084049820899963, "learning_rate": 5.699392560233017e-07, "loss": 0.0328, "num_tokens": 39588863.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999293088912964, "sampling/importance_sampling_ratio/min": 0.5453224778175354, "sampling/sampling_logp_difference/max": 0.8030078411102295, "sampling/sampling_logp_difference/mean": 0.01555370632559061, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3465157747268677, "epoch": 1.534313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.02178792693278117, "kl": 0.03237396106123924, "learning_rate": 5.69233809622687e-07, "loss": 0.0003, "num_tokens": 39617035.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.524991750717163, "sampling/importance_sampling_ratio/mean": 1.0002695322036743, "sampling/importance_sampling_ratio/min": 0.714755654335022, "sampling/sampling_logp_difference/max": 0.42198896408081055, "sampling/sampling_logp_difference/mean": 0.013200388289988041, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 173.296875, "completions/mean_terminated_length": 173.296875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3633451461791992, "epoch": 1.5355392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.020463538693140625, "kl": 0.035572197288274765, "learning_rate": 5.685282226818249e-07, "loss": 0.0003, "num_tokens": 39649118.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4389134645462036, "sampling/importance_sampling_ratio/mean": 0.9997230768203735, "sampling/importance_sampling_ratio/min": 0.6744645237922668, "sampling/sampling_logp_difference/max": 0.39383625984191895, "sampling/sampling_logp_difference/mean": 0.014470847323536873, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 161.484375, "completions/mean_terminated_length": 161.484375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.47554466128349304, "epoch": 1.5367647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.254555955122257, "kl": 0.03972441703081131, "learning_rate": 5.678224966330119e-07, "loss": -0.0012, "num_tokens": 39680525.0, "reward": 0.3125, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.9994322657585144, "sampling/importance_sampling_ratio/min": 0.7094095349311829, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.015320624224841595, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.40946847200393677, "epoch": 1.5379901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.02762295709941761, "kl": 0.047129470854997635, "learning_rate": 5.671166329088277e-07, "loss": 0.0004, "num_tokens": 39706497.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.292691946029663, "sampling/importance_sampling_ratio/mean": 0.9992033243179321, "sampling/importance_sampling_ratio/min": 0.69576495885849, "sampling/sampling_logp_difference/max": 0.3627433776855469, "sampling/sampling_logp_difference/mean": 0.01570296101272106, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 174.609375, "completions/mean_terminated_length": 174.609375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.35514920949935913, "epoch": 1.5392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.01934494958303013, "kl": 0.029100321233272552, "learning_rate": 5.664106329421305e-07, "loss": 0.0003, "num_tokens": 39735928.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4580605030059814, "sampling/importance_sampling_ratio/mean": 1.0000929832458496, "sampling/importance_sampling_ratio/min": 0.6222665905952454, "sampling/sampling_logp_difference/max": 0.47438669204711914, "sampling/sampling_logp_difference/mean": 0.012476583942770958, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 195.078125, "completions/mean_terminated_length": 195.078125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.43784743547439575, "epoch": 1.5404411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.0638600413008341, "kl": 0.04495566338300705, "learning_rate": 5.657044981660559e-07, "loss": 0.0113, "num_tokens": 39768541.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5521928071975708, "sampling/importance_sampling_ratio/mean": 0.9999226331710815, "sampling/importance_sampling_ratio/min": 0.6645677089691162, "sampling/sampling_logp_difference/max": 0.4396686553955078, "sampling/sampling_logp_difference/mean": 0.015002140775322914, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 156.09375, "completions/mean_terminated_length": 156.09375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.42065680027008057, "epoch": 1.5416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.02766408901112148, "kl": 0.04885207861661911, "learning_rate": 5.649982300140123e-07, "loss": 0.0005, "num_tokens": 39795427.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4181545972824097, "sampling/importance_sampling_ratio/mean": 0.999775767326355, "sampling/importance_sampling_ratio/min": 0.7194200754165649, "sampling/sampling_logp_difference/max": 0.34935641288757324, "sampling/sampling_logp_difference/mean": 0.016253039240837097, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 149.328125, "completions/mean_terminated_length": 149.328125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3208079934120178, "epoch": 1.5428921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.038933861003191204, "kl": 0.04223839193582535, "learning_rate": 5.642918299196796e-07, "loss": 0.0004, "num_tokens": 39819848.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3146650791168213, "sampling/importance_sampling_ratio/mean": 0.9999975562095642, "sampling/importance_sampling_ratio/min": 0.7336270809173584, "sampling/sampling_logp_difference/max": 0.30975449085235596, "sampling/sampling_logp_difference/mean": 0.01251753605902195, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 146.109375, "completions/mean_terminated_length": 146.109375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3680001199245453, "epoch": 1.5441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.026802418251748644, "kl": 0.04381459206342697, "learning_rate": 5.635852993170052e-07, "loss": 0.0004, "num_tokens": 39842927.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5502644777297974, "sampling/importance_sampling_ratio/mean": 1.000135898590088, "sampling/importance_sampling_ratio/min": 0.6775410175323486, "sampling/sampling_logp_difference/max": 0.43842554092407227, "sampling/sampling_logp_difference/mean": 0.015402581542730331, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.39454811811447144, "epoch": 1.545343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.02253259241212441, "kl": 0.036204319447278976, "learning_rate": 5.628786396402013e-07, "loss": 0.0004, "num_tokens": 39873771.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6648271083831787, "sampling/importance_sampling_ratio/mean": 1.0001277923583984, "sampling/importance_sampling_ratio/min": 0.6251299977302551, "sampling/sampling_logp_difference/max": 0.5097212791442871, "sampling/sampling_logp_difference/mean": 0.014238608069717884, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 139.796875, "completions/mean_terminated_length": 139.796875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.31300443410873413, "epoch": 1.5465686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.02311308752613259, "kl": 0.03415478765964508, "learning_rate": 5.621718523237426e-07, "loss": 0.0003, "num_tokens": 39898766.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3027833700180054, "sampling/importance_sampling_ratio/mean": 1.0000433921813965, "sampling/importance_sampling_ratio/min": 0.6423305869102478, "sampling/sampling_logp_difference/max": 0.44265222549438477, "sampling/sampling_logp_difference/mean": 0.012356936000287533, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 197.3125, "completions/mean_terminated_length": 197.3125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3989219069480896, "epoch": 1.5477941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.8978740474774581, "kl": 0.05985507369041443, "learning_rate": 5.614649388023622e-07, "loss": 0.0225, "num_tokens": 39928338.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3391810655593872, "sampling/importance_sampling_ratio/mean": 0.9998857975006104, "sampling/importance_sampling_ratio/min": 0.7125557065010071, "sampling/sampling_logp_difference/max": 0.3388972282409668, "sampling/sampling_logp_difference/mean": 0.014779426157474518, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 183.640625, "completions/mean_terminated_length": 183.640625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.30833882093429565, "epoch": 1.5490196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.019367216326750366, "kl": 0.02864927239716053, "learning_rate": 5.607579005110502e-07, "loss": 0.0003, "num_tokens": 39955419.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5163096189498901, "sampling/importance_sampling_ratio/mean": 0.9996656179428101, "sampling/importance_sampling_ratio/min": 0.6625163555145264, "sampling/sampling_logp_difference/max": 0.41627955436706543, "sampling/sampling_logp_difference/mean": 0.012281282804906368, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 180.03125, "completions/mean_terminated_length": 180.03125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3912290930747986, "epoch": 1.5502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8134361518544091, "kl": 0.0405205562710762, "learning_rate": 5.60050738885049e-07, "loss": -0.0021, "num_tokens": 39984301.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.362772822380066, "sampling/importance_sampling_ratio/mean": 0.9998492002487183, "sampling/importance_sampling_ratio/min": 0.744883120059967, "sampling/sampling_logp_difference/max": 0.3095214366912842, "sampling/sampling_logp_difference/mean": 0.015420593321323395, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 212.90625, "completions/mean_terminated_length": 212.90625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4786534011363983, "epoch": 1.5514705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.020424667846182472, "kl": 0.035735562443733215, "learning_rate": 5.593434553598525e-07, "loss": 0.0004, "num_tokens": 40017591.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.328540325164795, "sampling/importance_sampling_ratio/mean": 0.9997981786727905, "sampling/importance_sampling_ratio/min": 0.6714566349983215, "sampling/sampling_logp_difference/max": 0.39830589294433594, "sampling/sampling_logp_difference/mean": 0.017017975449562073, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 190.828125, "completions/mean_terminated_length": 190.828125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.448546826839447, "epoch": 1.5526960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.789071005407192, "kl": 0.03808409720659256, "learning_rate": 5.586360513712009e-07, "loss": 0.0002, "num_tokens": 40045132.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.361933946609497, "sampling/importance_sampling_ratio/mean": 1.0002695322036743, "sampling/importance_sampling_ratio/min": 0.617332935333252, "sampling/sampling_logp_difference/max": 0.482346773147583, "sampling/sampling_logp_difference/mean": 0.016598699614405632, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 193.140625, "completions/mean_terminated_length": 193.140625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.30193161964416504, "epoch": 1.553921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.016319502379357587, "kl": 0.024532688781619072, "learning_rate": 5.579285283550797e-07, "loss": 0.0002, "num_tokens": 40077301.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4356786012649536, "sampling/importance_sampling_ratio/mean": 0.9999880194664001, "sampling/importance_sampling_ratio/min": 0.6256784200668335, "sampling/sampling_logp_difference/max": 0.4689188003540039, "sampling/sampling_logp_difference/mean": 0.011309798806905746, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 209.421875, "completions/mean_terminated_length": 209.421875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3677401840686798, "epoch": 1.5551470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015574414670255563, "kl": 0.03165864199399948, "learning_rate": 5.572208877477159e-07, "loss": 0.0003, "num_tokens": 40110480.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4888417720794678, "sampling/importance_sampling_ratio/mean": 0.9999098777770996, "sampling/importance_sampling_ratio/min": 0.4572525918483734, "sampling/sampling_logp_difference/max": 0.7825193405151367, "sampling/sampling_logp_difference/mean": 0.014508018270134926, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 149.328125, "completions/mean_terminated_length": 149.328125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.37257325649261475, "epoch": 1.5563725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.022973999136349448, "kl": 0.04484736546874046, "learning_rate": 5.565131309855752e-07, "loss": 0.0004, "num_tokens": 40138037.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.54764986038208, "sampling/importance_sampling_ratio/mean": 1.0002665519714355, "sampling/importance_sampling_ratio/min": 0.6457412242889404, "sampling/sampling_logp_difference/max": 0.43735647201538086, "sampling/sampling_logp_difference/mean": 0.015225168317556381, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 226.234375, "completions/mean_terminated_length": 226.234375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.37977835536003113, "epoch": 1.5575980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.6355925387914, "kl": 0.03878624737262726, "learning_rate": 5.558052595053586e-07, "loss": -0.0043, "num_tokens": 40176532.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5467371940612793, "sampling/importance_sampling_ratio/mean": 1.0005635023117065, "sampling/importance_sampling_ratio/min": 0.6368696689605713, "sampling/sampling_logp_difference/max": 0.4511902332305908, "sampling/sampling_logp_difference/mean": 0.013250889256596565, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.33750832080841064, "epoch": 1.5588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01839720443880899, "kl": 0.030976727604866028, "learning_rate": 5.550972747440005e-07, "loss": 0.0003, "num_tokens": 40203404.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6207518577575684, "sampling/importance_sampling_ratio/mean": 1.0003291368484497, "sampling/importance_sampling_ratio/min": 0.6080752611160278, "sampling/sampling_logp_difference/max": 0.49745655059814453, "sampling/sampling_logp_difference/mean": 0.014426854439079762, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 265.71875, "completions/mean_terminated_length": 265.71875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.46240752935409546, "epoch": 1.5600490196078431, "frac_reward_zero_std": 0.5, "grad_norm": 0.9579825277156595, "kl": 0.05170414596796036, "learning_rate": 5.543891781386655e-07, "loss": 0.0534, "num_tokens": 40245946.0, "reward": 0.40625, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3931963443756104, "sampling/importance_sampling_ratio/mean": 0.9996916055679321, "sampling/importance_sampling_ratio/min": 0.6954973936080933, "sampling/sampling_logp_difference/max": 0.3631279468536377, "sampling/sampling_logp_difference/mean": 0.014100045897066593, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.34568995237350464, "epoch": 1.5612745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.026876653440361203, "kl": 0.03240996599197388, "learning_rate": 5.536809711267443e-07, "loss": 0.0003, "num_tokens": 40274010.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6146008968353271, "sampling/importance_sampling_ratio/mean": 1.0002329349517822, "sampling/importance_sampling_ratio/min": 0.6587986946105957, "sampling/sampling_logp_difference/max": 0.47908782958984375, "sampling/sampling_logp_difference/mean": 0.013071177527308464, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 195.96875, "completions/mean_terminated_length": 195.96875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.419927179813385, "epoch": 1.5625, "frac_reward_zero_std": 1.0, "grad_norm": 0.022425113760107574, "kl": 0.03857335448265076, "learning_rate": 5.529726551458526e-07, "loss": 0.0004, "num_tokens": 40307144.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4335927963256836, "sampling/importance_sampling_ratio/mean": 0.9999467730522156, "sampling/importance_sampling_ratio/min": 0.6542967557907104, "sampling/sampling_logp_difference/max": 0.4241943359375, "sampling/sampling_logp_difference/mean": 0.015529746189713478, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 213.015625, "completions/mean_terminated_length": 213.015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4056742191314697, "epoch": 1.5637254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0232241560707057, "kl": 0.03813936561346054, "learning_rate": 5.522642316338268e-07, "loss": 0.0003, "num_tokens": 40338713.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6377936601638794, "sampling/importance_sampling_ratio/mean": 1.000185489654541, "sampling/importance_sampling_ratio/min": 0.6237207651138306, "sampling/sampling_logp_difference/max": 0.4933500289916992, "sampling/sampling_logp_difference/mean": 0.014487029053270817, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 163.84375, "completions/mean_terminated_length": 163.84375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4125819802284241, "epoch": 1.5649509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.030463973536295753, "kl": 0.061399657279253006, "learning_rate": 5.515557020287218e-07, "loss": 0.0006, "num_tokens": 40364751.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6288862228393555, "sampling/importance_sampling_ratio/mean": 1.0000745058059692, "sampling/importance_sampling_ratio/min": 0.7075063586235046, "sampling/sampling_logp_difference/max": 0.4878964424133301, "sampling/sampling_logp_difference/mean": 0.015359732322394848, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4101026654243469, "epoch": 1.5661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03697423516826225, "kl": 0.06247444078326225, "learning_rate": 5.508470677688078e-07, "loss": 0.0006, "num_tokens": 40397063.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3432215452194214, "sampling/importance_sampling_ratio/mean": 0.9993553161621094, "sampling/importance_sampling_ratio/min": 0.6600155234336853, "sampling/sampling_logp_difference/max": 0.4154919385910034, "sampling/sampling_logp_difference/mean": 0.014263832941651344, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 212.21875, "completions/mean_terminated_length": 212.21875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.48836657404899597, "epoch": 1.5674019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7521601665316348, "kl": 0.05656624212861061, "learning_rate": 5.501383302925677e-07, "loss": -0.0254, "num_tokens": 40433349.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6507452726364136, "sampling/importance_sampling_ratio/mean": 0.9997700452804565, "sampling/importance_sampling_ratio/min": 0.6106220483779907, "sampling/sampling_logp_difference/max": 0.5012269020080566, "sampling/sampling_logp_difference/mean": 0.01653190888464451, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 221.34375, "completions/mean_terminated_length": 221.34375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.43000924587249756, "epoch": 1.5686274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.020274630867750474, "kl": 0.040560007095336914, "learning_rate": 5.494294910386933e-07, "loss": 0.0004, "num_tokens": 40470571.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7524255514144897, "sampling/importance_sampling_ratio/mean": 1.000464677810669, "sampling/importance_sampling_ratio/min": 0.5848381519317627, "sampling/sampling_logp_difference/max": 0.5610008239746094, "sampling/sampling_logp_difference/mean": 0.015560484491288662, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 155.296875, "completions/mean_terminated_length": 155.296875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.42848485708236694, "epoch": 1.5698529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.029067613428986974, "kl": 0.049033183604478836, "learning_rate": 5.487205514460835e-07, "loss": 0.0005, "num_tokens": 40496974.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4896117448806763, "sampling/importance_sampling_ratio/mean": 1.0005073547363281, "sampling/importance_sampling_ratio/min": 0.618243396282196, "sampling/sampling_logp_difference/max": 0.48087310791015625, "sampling/sampling_logp_difference/mean": 0.015883062034845352, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 149.21875, "completions/mean_terminated_length": 149.21875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.3263126611709595, "epoch": 1.571078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.034101301532964895, "kl": 0.044961195439100266, "learning_rate": 5.480115129538409e-07, "loss": 0.0004, "num_tokens": 40525420.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.405886173248291, "sampling/importance_sampling_ratio/mean": 1.0003024339675903, "sampling/importance_sampling_ratio/min": 0.6408926248550415, "sampling/sampling_logp_difference/max": 0.44489336013793945, "sampling/sampling_logp_difference/mean": 0.013876670971512794, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 189.859375, "completions/mean_terminated_length": 189.859375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.41377168893814087, "epoch": 1.5723039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02701900619995695, "kl": 0.05792352557182312, "learning_rate": 5.473023770012686e-07, "loss": 0.0006, "num_tokens": 40553427.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000136137008667, "sampling/importance_sampling_ratio/min": 0.6201323866844177, "sampling/sampling_logp_difference/max": 0.7010538578033447, "sampling/sampling_logp_difference/mean": 0.016421273350715637, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 197.6875, "completions/mean_terminated_length": 197.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4023253917694092, "epoch": 1.5735294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.023264841923209056, "kl": 0.04238691180944443, "learning_rate": 5.465931450278676e-07, "loss": 0.0004, "num_tokens": 40585567.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.303682565689087, "sampling/importance_sampling_ratio/mean": 0.9998633861541748, "sampling/importance_sampling_ratio/min": 0.6077167391777039, "sampling/sampling_logp_difference/max": 0.4980463981628418, "sampling/sampling_logp_difference/mean": 0.014768811874091625, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 262.921875, "completions/mean_terminated_length": 262.921875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4775471091270447, "epoch": 1.5747549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01923680643591346, "kl": 0.04368182271718979, "learning_rate": 5.458838184733341e-07, "loss": 0.0004, "num_tokens": 40618570.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4640520811080933, "sampling/importance_sampling_ratio/mean": 1.0002003908157349, "sampling/importance_sampling_ratio/min": 0.6963002681732178, "sampling/sampling_logp_difference/max": 0.3812079429626465, "sampling/sampling_logp_difference/mean": 0.015857627615332603, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 189.203125, "completions/mean_terminated_length": 189.203125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3754490613937378, "epoch": 1.5759803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.7177864859107354, "kl": 0.034951746463775635, "learning_rate": 5.451743987775559e-07, "loss": -0.0027, "num_tokens": 40649975.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3653631210327148, "sampling/importance_sampling_ratio/mean": 0.9998676776885986, "sampling/importance_sampling_ratio/min": 0.6218180656433105, "sampling/sampling_logp_difference/max": 0.47510766983032227, "sampling/sampling_logp_difference/mean": 0.013944664038717747, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 225.65625, "completions/mean_terminated_length": 225.65625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4051673412322998, "epoch": 1.5772058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.5990588312894888, "kl": 0.043059948831796646, "learning_rate": 5.444648873806101e-07, "loss": -0.0108, "num_tokens": 40679457.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6007344722747803, "sampling/importance_sampling_ratio/mean": 0.9999345541000366, "sampling/importance_sampling_ratio/min": 0.5071929097175598, "sampling/sampling_logp_difference/max": 0.6788637638092041, "sampling/sampling_logp_difference/mean": 0.015464607626199722, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 178.6875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.33906644582748413, "epoch": 1.5784313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.7745910230403468, "kl": 0.029677651822566986, "learning_rate": 5.437552857227597e-07, "loss": -0.0025, "num_tokens": 40707453.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.436586618423462, "sampling/importance_sampling_ratio/mean": 0.9997626543045044, "sampling/importance_sampling_ratio/min": 0.6366055011749268, "sampling/sampling_logp_difference/max": 0.45160508155822754, "sampling/sampling_logp_difference/mean": 0.01307828351855278, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 203.234375, "completions/mean_terminated_length": 203.234375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4389653205871582, "epoch": 1.579656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.6227759495627972, "kl": 0.04993196576833725, "learning_rate": 5.430455952444512e-07, "loss": -0.0091, "num_tokens": 40732556.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4736192226409912, "sampling/importance_sampling_ratio/mean": 0.9992948770523071, "sampling/importance_sampling_ratio/min": 0.6626046299934387, "sampling/sampling_logp_difference/max": 0.4115767478942871, "sampling/sampling_logp_difference/mean": 0.01605891063809395, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 230.03125, "completions/mean_terminated_length": 230.03125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4080777168273926, "epoch": 1.5808823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.018053235471991634, "kl": 0.034843411296606064, "learning_rate": 5.423358173863116e-07, "loss": 0.0003, "num_tokens": 40765054.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352980852127075, "sampling/importance_sampling_ratio/mean": 1.0002480745315552, "sampling/importance_sampling_ratio/min": 0.654730498790741, "sampling/sampling_logp_difference/max": 0.4235316514968872, "sampling/sampling_logp_difference/mean": 0.01420527696609497, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 201.515625, "completions/mean_terminated_length": 201.515625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.33541762828826904, "epoch": 1.5821078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.01863793061088374, "kl": 0.035432569682598114, "learning_rate": 5.416259535891446e-07, "loss": 0.0003, "num_tokens": 40795071.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.43512761592865, "sampling/importance_sampling_ratio/mean": 0.999920129776001, "sampling/importance_sampling_ratio/min": 0.6430781483650208, "sampling/sampling_logp_difference/max": 0.44148898124694824, "sampling/sampling_logp_difference/mean": 0.012585221789777279, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 204.359375, "completions/mean_terminated_length": 204.359375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.43739044666290283, "epoch": 1.5833333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.02115004900477092, "kl": 0.0341816246509552, "learning_rate": 5.409160052939291e-07, "loss": 0.0003, "num_tokens": 40830054.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.751325011253357, "sampling/importance_sampling_ratio/mean": 0.9997460842132568, "sampling/importance_sampling_ratio/min": 0.7260932922363281, "sampling/sampling_logp_difference/max": 0.5603725910186768, "sampling/sampling_logp_difference/mean": 0.015032818540930748, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 153.359375, "completions/mean_terminated_length": 153.359375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3930108845233917, "epoch": 1.5845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.03688061085249617, "kl": 0.05595723167061806, "learning_rate": 5.402059739418148e-07, "loss": 0.0005, "num_tokens": 40856589.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.597269892692566, "sampling/importance_sampling_ratio/mean": 1.0001264810562134, "sampling/importance_sampling_ratio/min": 0.5229350328445435, "sampling/sampling_logp_difference/max": 0.6482980251312256, "sampling/sampling_logp_difference/mean": 0.015565967187285423, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 196.34375, "completions/mean_terminated_length": 196.34375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.35479289293289185, "epoch": 1.5857843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.017859915243605095, "kl": 0.034353796392679214, "learning_rate": 5.394958609741206e-07, "loss": 0.0003, "num_tokens": 40884963.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4333220720291138, "sampling/importance_sampling_ratio/mean": 0.9996464252471924, "sampling/importance_sampling_ratio/min": 0.6380343437194824, "sampling/sampling_logp_difference/max": 0.44936323165893555, "sampling/sampling_logp_difference/mean": 0.013390054926276207, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 265.265625, "completions/mean_terminated_length": 265.265625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.5733312368392944, "epoch": 1.5870098039215685, "frac_reward_zero_std": 0.25, "grad_norm": 1.0958078551567019, "kl": 0.06653326004743576, "learning_rate": 5.387856678323307e-07, "loss": 0.0205, "num_tokens": 40919972.0, "reward": -0.09375, "reward_std": 0.6802700161933899, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006455183029175, "sampling/importance_sampling_ratio/min": 0.6330235004425049, "sampling/sampling_logp_difference/max": 0.9251754283905029, "sampling/sampling_logp_difference/mean": 0.017458755522966385, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 171.21875, "completions/mean_terminated_length": 171.21875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4025220572948456, "epoch": 1.5882352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.019983676868800686, "kl": 0.04178793355822563, "learning_rate": 5.380753959580922e-07, "loss": 0.0003, "num_tokens": 40960546.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4041763544082642, "sampling/importance_sampling_ratio/mean": 0.9998598098754883, "sampling/importance_sampling_ratio/min": 0.7072054147720337, "sampling/sampling_logp_difference/max": 0.3464341163635254, "sampling/sampling_logp_difference/mean": 0.014135192148387432, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 246.9375, "completions/mean_terminated_length": 246.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5462402105331421, "epoch": 1.5894607843137254, "frac_reward_zero_std": 0.5, "grad_norm": 0.9179610903917748, "kl": 0.057419903576374054, "learning_rate": 5.373650467932121e-07, "loss": 0.0205, "num_tokens": 40994238.0, "reward": 0.59375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.358890414237976, "sampling/importance_sampling_ratio/mean": 0.9993711709976196, "sampling/importance_sampling_ratio/min": 0.7226272821426392, "sampling/sampling_logp_difference/max": 0.3248617649078369, "sampling/sampling_logp_difference/mean": 0.015900490805506706, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 231.1875, "completions/mean_terminated_length": 231.1875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.35127443075180054, "epoch": 1.590686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.020664295052024075, "kl": 0.028000975027680397, "learning_rate": 5.366546217796541e-07, "loss": 0.0003, "num_tokens": 41029946.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3271061182022095, "sampling/importance_sampling_ratio/mean": 1.0001511573791504, "sampling/importance_sampling_ratio/min": 0.7108889818191528, "sampling/sampling_logp_difference/max": 0.34123897552490234, "sampling/sampling_logp_difference/mean": 0.012899599969387054, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 214.03125, "completions/mean_terminated_length": 214.03125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.35238945484161377, "epoch": 1.5919117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.0824990246826658, "kl": 0.03601183369755745, "learning_rate": 5.359441223595363e-07, "loss": 0.0083, "num_tokens": 41063820.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.799363613128662, "sampling/importance_sampling_ratio/mean": 1.000187635421753, "sampling/importance_sampling_ratio/min": 0.7322774529457092, "sampling/sampling_logp_difference/max": 0.5874330997467041, "sampling/sampling_logp_difference/mean": 0.013019061647355556, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 199.21875, "completions/mean_terminated_length": 199.21875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3887077867984772, "epoch": 1.593137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.0711855415051406, "kl": 0.044857725501060486, "learning_rate": 5.352335499751269e-07, "loss": -0.0105, "num_tokens": 41092842.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4760971069335938, "sampling/importance_sampling_ratio/mean": 1.00020170211792, "sampling/importance_sampling_ratio/min": 0.6391717791557312, "sampling/sampling_logp_difference/max": 0.4475820064544678, "sampling/sampling_logp_difference/mean": 0.013103963807225227, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 249.46875, "completions/mean_terminated_length": 249.46875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.38398146629333496, "epoch": 1.594362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.7715913046425408, "kl": 0.0276858601719141, "learning_rate": 5.345229060688433e-07, "loss": 0.0218, "num_tokens": 41128344.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6073960065841675, "sampling/importance_sampling_ratio/mean": 1.000038981437683, "sampling/importance_sampling_ratio/min": 0.6056225895881653, "sampling/sampling_logp_difference/max": 0.5014982223510742, "sampling/sampling_logp_difference/mean": 0.012683748267591, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 214.515625, "completions/mean_terminated_length": 214.515625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.38036349415779114, "epoch": 1.5955882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.01941041468749474, "kl": 0.03759653866291046, "learning_rate": 5.338121920832475e-07, "loss": 0.0003, "num_tokens": 41159721.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4104421138763428, "sampling/importance_sampling_ratio/mean": 1.0005254745483398, "sampling/importance_sampling_ratio/min": 0.6971437931060791, "sampling/sampling_logp_difference/max": 0.3607635498046875, "sampling/sampling_logp_difference/mean": 0.014444278553128242, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 216.890625, "completions/mean_terminated_length": 216.890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4683217406272888, "epoch": 1.596813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7166611596871837, "kl": 0.06479225307703018, "learning_rate": 5.331014094610438e-07, "loss": -0.0064, "num_tokens": 41188850.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.454557180404663, "sampling/importance_sampling_ratio/mean": 0.999299168586731, "sampling/importance_sampling_ratio/min": 0.6173462271690369, "sampling/sampling_logp_difference/max": 0.48232531547546387, "sampling/sampling_logp_difference/mean": 0.015371318906545639, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 194.828125, "completions/mean_terminated_length": 194.828125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.36123284697532654, "epoch": 1.5980392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.022109711881336674, "kl": 0.04242018610239029, "learning_rate": 5.323905596450759e-07, "loss": 0.0004, "num_tokens": 41219111.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4179425239562988, "sampling/importance_sampling_ratio/mean": 0.9995553493499756, "sampling/importance_sampling_ratio/min": 0.5423824191093445, "sampling/sampling_logp_difference/max": 0.6117839813232422, "sampling/sampling_logp_difference/mean": 0.01411019079387188, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 203.65625, "completions/mean_terminated_length": 203.65625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.408869206905365, "epoch": 1.5992647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.018898870023638714, "kl": 0.03537284582853317, "learning_rate": 5.31679644078324e-07, "loss": 0.0004, "num_tokens": 41247793.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5288374423980713, "sampling/importance_sampling_ratio/mean": 0.9998724460601807, "sampling/importance_sampling_ratio/min": 0.6658873558044434, "sampling/sampling_logp_difference/max": 0.42450761795043945, "sampling/sampling_logp_difference/mean": 0.013627918437123299, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 205.40625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.39194366335868835, "epoch": 1.6004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7494485877542367, "kl": 0.044651098549366, "learning_rate": 5.309686642039015e-07, "loss": -0.0064, "num_tokens": 41276507.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4754210710525513, "sampling/importance_sampling_ratio/mean": 1.0001399517059326, "sampling/importance_sampling_ratio/min": 0.7034305334091187, "sampling/sampling_logp_difference/max": 0.3889434337615967, "sampling/sampling_logp_difference/mean": 0.013582364656031132, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 182.484375, "completions/mean_terminated_length": 182.484375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.35944098234176636, "epoch": 1.6017156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.018836484571554558, "kl": 0.037919431924819946, "learning_rate": 5.302576214650527e-07, "loss": 0.0003, "num_tokens": 41307194.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4416300058364868, "sampling/importance_sampling_ratio/mean": 0.9997807741165161, "sampling/importance_sampling_ratio/min": 0.6219866871833801, "sampling/sampling_logp_difference/max": 0.4748365879058838, "sampling/sampling_logp_difference/mean": 0.013919560238718987, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 191.484375, "completions/mean_terminated_length": 191.484375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3720022439956665, "epoch": 1.6029411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.7226069802139616, "kl": 0.04658423736691475, "learning_rate": 5.295465173051491e-07, "loss": 0.0121, "num_tokens": 41337849.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3114043474197388, "sampling/importance_sampling_ratio/mean": 0.9997433423995972, "sampling/importance_sampling_ratio/min": 0.6264688968658447, "sampling/sampling_logp_difference/max": 0.46765613555908203, "sampling/sampling_logp_difference/mean": 0.013154737651348114, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.45763248205184937, "epoch": 1.6041666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.7688062162783167, "kl": 0.05524931475520134, "learning_rate": 5.288353531676873e-07, "loss": 0.012, "num_tokens": 41368129.0, "reward": -0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": -0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.414183497428894, "sampling/importance_sampling_ratio/mean": 0.9998292922973633, "sampling/importance_sampling_ratio/min": 0.7510547041893005, "sampling/sampling_logp_difference/max": 0.34655237197875977, "sampling/sampling_logp_difference/mean": 0.014586166478693485, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 156.71875, "completions/mean_terminated_length": 156.71875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.28859561681747437, "epoch": 1.6053921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.02383512103423003, "kl": 0.04662370681762695, "learning_rate": 5.281241304962852e-07, "loss": 0.0004, "num_tokens": 41394447.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.565940022468567, "sampling/importance_sampling_ratio/mean": 1.000131607055664, "sampling/importance_sampling_ratio/min": 0.6829407811164856, "sampling/sampling_logp_difference/max": 0.448486328125, "sampling/sampling_logp_difference/mean": 0.01221737265586853, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 147.078125, "completions/mean_terminated_length": 147.078125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4415820837020874, "epoch": 1.6066176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.9719135272575133, "kl": 0.06833760440349579, "learning_rate": 5.2741285073468e-07, "loss": -0.0145, "num_tokens": 41430052.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6259056329727173, "sampling/importance_sampling_ratio/mean": 0.9997236728668213, "sampling/importance_sampling_ratio/min": 0.6579318046569824, "sampling/sampling_logp_difference/max": 0.4860649108886719, "sampling/sampling_logp_difference/mean": 0.016034625470638275, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 196.53125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4524112641811371, "epoch": 1.607843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.4441698959929314, "kl": 0.05132122337818146, "learning_rate": 5.267015153267245e-07, "loss": -0.0446, "num_tokens": 41462454.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6315703392028809, "sampling/importance_sampling_ratio/mean": 0.9993882179260254, "sampling/importance_sampling_ratio/min": 0.6724433898925781, "sampling/sampling_logp_difference/max": 0.48954296112060547, "sampling/sampling_logp_difference/mean": 0.015758004039525986, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 160.84375, "completions/mean_terminated_length": 160.84375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3407145142555237, "epoch": 1.6090686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.9106068174314087, "kl": 0.041749805212020874, "learning_rate": 5.259901257163844e-07, "loss": -0.0003, "num_tokens": 41489292.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6275004148483276, "sampling/importance_sampling_ratio/mean": 1.0000028610229492, "sampling/importance_sampling_ratio/min": 0.7483863234519958, "sampling/sampling_logp_difference/max": 0.4870452880859375, "sampling/sampling_logp_difference/mean": 0.012471785768866539, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.4330427348613739, "epoch": 1.6102941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0283704013003629, "kl": 0.048846468329429626, "learning_rate": 5.252786833477358e-07, "loss": 0.0005, "num_tokens": 41517879.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3068888187408447, "sampling/importance_sampling_ratio/mean": 1.0002931356430054, "sampling/importance_sampling_ratio/min": 0.6772646307945251, "sampling/sampling_logp_difference/max": 0.3896932601928711, "sampling/sampling_logp_difference/mean": 0.014600535854697227, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 245.0625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.39107346534729004, "epoch": 1.6115196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.6438024775641634, "kl": 0.028462667018175125, "learning_rate": 5.245671896649612e-07, "loss": -0.0296, "num_tokens": 41552587.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.449144721031189, "sampling/importance_sampling_ratio/mean": 0.9995109438896179, "sampling/importance_sampling_ratio/min": 0.4164716899394989, "sampling/sampling_logp_difference/max": 0.87593674659729, "sampling/sampling_logp_difference/mean": 0.012780715711414814, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 193.015625, "completions/mean_terminated_length": 193.015625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.419755756855011, "epoch": 1.6127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01988272897693721, "kl": 0.03518611937761307, "learning_rate": 5.23855646112348e-07, "loss": 0.0003, "num_tokens": 41580236.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4417973756790161, "sampling/importance_sampling_ratio/mean": 0.999925434589386, "sampling/importance_sampling_ratio/min": 0.7690115571022034, "sampling/sampling_logp_difference/max": 0.3658905029296875, "sampling/sampling_logp_difference/mean": 0.014504168182611465, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 201.046875, "completions/mean_terminated_length": 201.046875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.45324522256851196, "epoch": 1.6139705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.027969975864834386, "kl": 0.044220201671123505, "learning_rate": 5.231440541342845e-07, "loss": 0.0004, "num_tokens": 41608607.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3302301168441772, "sampling/importance_sampling_ratio/mean": 0.9996073246002197, "sampling/importance_sampling_ratio/min": 0.6945046782493591, "sampling/sampling_logp_difference/max": 0.36455631256103516, "sampling/sampling_logp_difference/mean": 0.015408453531563282, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 222.09375, "completions/mean_terminated_length": 222.09375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3381379246711731, "epoch": 1.6151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.016143746427933273, "kl": 0.03346278890967369, "learning_rate": 5.224324151752575e-07, "loss": 0.0003, "num_tokens": 41643205.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4749078750610352, "sampling/importance_sampling_ratio/mean": 0.9998982548713684, "sampling/importance_sampling_ratio/min": 0.7725921869277954, "sampling/sampling_logp_difference/max": 0.3885955810546875, "sampling/sampling_logp_difference/mean": 0.011889193207025528, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 216.828125, "completions/mean_terminated_length": 216.828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.46138995885849, "epoch": 1.616421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.3058688689762155, "kl": 0.0376376174390316, "learning_rate": 5.217207306798487e-07, "loss": -0.0307, "num_tokens": 41674202.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4060932397842407, "sampling/importance_sampling_ratio/mean": 0.9996814131736755, "sampling/importance_sampling_ratio/min": 0.5261827707290649, "sampling/sampling_logp_difference/max": 0.6421066522598267, "sampling/sampling_logp_difference/mean": 0.01577083021402359, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 179.171875, "completions/mean_terminated_length": 179.171875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.374620258808136, "epoch": 1.6176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.023526682651923623, "kl": 0.03740628436207771, "learning_rate": 5.210090020927326e-07, "loss": 0.0004, "num_tokens": 41703829.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4194133281707764, "sampling/importance_sampling_ratio/mean": 0.9996632933616638, "sampling/importance_sampling_ratio/min": 0.6773514151573181, "sampling/sampling_logp_difference/max": 0.389565110206604, "sampling/sampling_logp_difference/mean": 0.014365723356604576, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 187.84375, "completions/mean_terminated_length": 187.84375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4308626055717468, "epoch": 1.6188725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.021660701054459646, "kl": 0.039525438100099564, "learning_rate": 5.202972308586735e-07, "loss": 0.0004, "num_tokens": 41738619.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.607591986656189, "sampling/importance_sampling_ratio/mean": 1.0002162456512451, "sampling/importance_sampling_ratio/min": 0.7422322630882263, "sampling/sampling_logp_difference/max": 0.47473740577697754, "sampling/sampling_logp_difference/mean": 0.015454979613423347, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 173.640625, "completions/mean_terminated_length": 173.640625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.41735368967056274, "epoch": 1.6200980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.022611943103419407, "kl": 0.035894010215997696, "learning_rate": 5.195854184225213e-07, "loss": 0.0004, "num_tokens": 41768004.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2847273349761963, "sampling/importance_sampling_ratio/mean": 0.999764084815979, "sampling/importance_sampling_ratio/min": 0.7095668315887451, "sampling/sampling_logp_difference/max": 0.34310054779052734, "sampling/sampling_logp_difference/mean": 0.014909334480762482, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 186.296875, "completions/mean_terminated_length": 186.296875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.36383774876594543, "epoch": 1.6213235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.019477232921706708, "kl": 0.03461875766515732, "learning_rate": 5.188735662292107e-07, "loss": 0.0003, "num_tokens": 41795767.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.663252830505371, "sampling/importance_sampling_ratio/mean": 1.0004137754440308, "sampling/importance_sampling_ratio/min": 0.6158820986747742, "sampling/sampling_logp_difference/max": 0.5087752342224121, "sampling/sampling_logp_difference/mean": 0.014002447947859764, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 164.6875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.41262656450271606, "epoch": 1.6225490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.8563376392888821, "kl": 0.05176469683647156, "learning_rate": 5.181616757237561e-07, "loss": 0.0065, "num_tokens": 41821475.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4510161876678467, "sampling/importance_sampling_ratio/mean": 1.0002225637435913, "sampling/importance_sampling_ratio/min": 0.6574416756629944, "sampling/sampling_logp_difference/max": 0.4193992614746094, "sampling/sampling_logp_difference/mean": 0.015879232436418533, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 192.453125, "completions/mean_terminated_length": 192.453125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.4138108193874359, "epoch": 1.6237745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.030717015522757213, "kl": 0.06454174965620041, "learning_rate": 5.174497483512505e-07, "loss": 0.0007, "num_tokens": 41851264.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3186839818954468, "sampling/importance_sampling_ratio/mean": 1.0005762577056885, "sampling/importance_sampling_ratio/min": 0.6449248790740967, "sampling/sampling_logp_difference/max": 0.4386214017868042, "sampling/sampling_logp_difference/mean": 0.015171162784099579, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 204.359375, "completions/mean_terminated_length": 204.359375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3680081367492676, "epoch": 1.625, "frac_reward_zero_std": 0.75, "grad_norm": 1.0144647496447934, "kl": 0.04304346442222595, "learning_rate": 5.167377855568612e-07, "loss": 0.073, "num_tokens": 41885303.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5975630283355713, "sampling/importance_sampling_ratio/mean": 1.0000262260437012, "sampling/importance_sampling_ratio/min": 0.6099649667739868, "sampling/sampling_logp_difference/max": 0.4943537712097168, "sampling/sampling_logp_difference/mean": 0.01340695470571518, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 181.953125, "completions/mean_terminated_length": 181.953125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.4094926118850708, "epoch": 1.6262254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 1.0132436576303403, "kl": 0.038716308772563934, "learning_rate": 5.160257887858277e-07, "loss": 0.0322, "num_tokens": 41922756.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5303932428359985, "sampling/importance_sampling_ratio/mean": 1.0002155303955078, "sampling/importance_sampling_ratio/min": 0.7565470933914185, "sampling/sampling_logp_difference/max": 0.4255247116088867, "sampling/sampling_logp_difference/mean": 0.015106882899999619, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 165.234375, "completions/mean_terminated_length": 165.234375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.37199485301971436, "epoch": 1.6274509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02053875434826521, "kl": 0.04270268231630325, "learning_rate": 5.15313759483458e-07, "loss": 0.0004, "num_tokens": 41948563.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3621101379394531, "sampling/importance_sampling_ratio/mean": 0.9995244145393372, "sampling/importance_sampling_ratio/min": 0.6622359752655029, "sampling/sampling_logp_difference/max": 0.4121333360671997, "sampling/sampling_logp_difference/mean": 0.014357365667819977, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 168.140625, "completions/mean_terminated_length": 168.140625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5480635762214661, "epoch": 1.6286764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.8991249863607076, "kl": 0.06932510435581207, "learning_rate": 5.146016990951268e-07, "loss": -0.0296, "num_tokens": 41978252.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3300795555114746, "sampling/importance_sampling_ratio/mean": 0.9992871880531311, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.01712382212281227, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 181.640625, "completions/mean_terminated_length": 181.640625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3745878338813782, "epoch": 1.6299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8996782301834043, "kl": 0.04495788365602493, "learning_rate": 5.138896090662714e-07, "loss": 0.0197, "num_tokens": 42010021.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3376659154891968, "sampling/importance_sampling_ratio/mean": 1.0002796649932861, "sampling/importance_sampling_ratio/min": 0.6891587972640991, "sampling/sampling_logp_difference/max": 0.37228357791900635, "sampling/sampling_logp_difference/mean": 0.01386922039091587, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 163.671875, "completions/mean_terminated_length": 163.671875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4854891896247864, "epoch": 1.6311274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.8256542939191935, "kl": 0.060473497956991196, "learning_rate": 5.131774908423898e-07, "loss": 0.0016, "num_tokens": 42035952.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6098212003707886, "sampling/importance_sampling_ratio/mean": 1.0002360343933105, "sampling/importance_sampling_ratio/min": 0.6633803248405457, "sampling/sampling_logp_difference/max": 0.4761230945587158, "sampling/sampling_logp_difference/mean": 0.01714945212006569, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 164.828125, "completions/mean_terminated_length": 164.828125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3972048759460449, "epoch": 1.6323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.02213680524369021, "kl": 0.04135869815945625, "learning_rate": 5.124653458690365e-07, "loss": 0.0004, "num_tokens": 42064821.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6158479452133179, "sampling/importance_sampling_ratio/mean": 1.0002546310424805, "sampling/importance_sampling_ratio/min": 0.7061274647712708, "sampling/sampling_logp_difference/max": 0.4798598289489746, "sampling/sampling_logp_difference/mean": 0.0156618170440197, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 179.4375, "completions/mean_terminated_length": 179.4375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.37442547082901, "epoch": 1.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.02273687112248634, "kl": 0.03795505315065384, "learning_rate": 5.117531755918207e-07, "loss": 0.0004, "num_tokens": 42092401.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4187004566192627, "sampling/importance_sampling_ratio/mean": 1.000243902206421, "sampling/importance_sampling_ratio/min": 0.700717031955719, "sampling/sampling_logp_difference/max": 0.3556511402130127, "sampling/sampling_logp_difference/mean": 0.014409808441996574, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.40468767285346985, "epoch": 1.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.022758320385781947, "kl": 0.03673470765352249, "learning_rate": 5.110409814564031e-07, "loss": 0.0003, "num_tokens": 42128281.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.439674973487854, "sampling/importance_sampling_ratio/mean": 0.9996200799942017, "sampling/importance_sampling_ratio/min": 0.6193218231201172, "sampling/sampling_logp_difference/max": 0.47913026809692383, "sampling/sampling_logp_difference/mean": 0.014988134615123272, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 175.53125, "completions/mean_terminated_length": 175.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4532517194747925, "epoch": 1.6360294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.8069309298947697, "kl": 0.042012833058834076, "learning_rate": 5.103287649084926e-07, "loss": 0.0042, "num_tokens": 42156587.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.395279884338379, "sampling/importance_sampling_ratio/mean": 0.9997216463088989, "sampling/importance_sampling_ratio/min": 0.6385092735290527, "sampling/sampling_logp_difference/max": 0.44861912727355957, "sampling/sampling_logp_difference/mean": 0.015587395057082176, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 153.59375, "completions/mean_terminated_length": 153.59375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.33972054719924927, "epoch": 1.6372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.021105598295994576, "kl": 0.03662524372339249, "learning_rate": 5.096165273938435e-07, "loss": 0.0003, "num_tokens": 42183297.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3670557737350464, "sampling/importance_sampling_ratio/mean": 1.0000076293945312, "sampling/importance_sampling_ratio/min": 0.6033264994621277, "sampling/sampling_logp_difference/max": 0.5052967071533203, "sampling/sampling_logp_difference/mean": 0.013634255155920982, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 183.359375, "completions/mean_terminated_length": 183.359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4015965759754181, "epoch": 1.6384803921568627, "frac_reward_zero_std": 0.75, "grad_norm": 0.8156907350196695, "kl": 0.05624488741159439, "learning_rate": 5.089042703582533e-07, "loss": -0.0005, "num_tokens": 42212376.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4057984352111816, "sampling/importance_sampling_ratio/mean": 1.00014066696167, "sampling/importance_sampling_ratio/min": 0.6264901161193848, "sampling/sampling_logp_difference/max": 0.4676222801208496, "sampling/sampling_logp_difference/mean": 0.014927358366549015, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 198.09375, "completions/mean_terminated_length": 198.09375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.45356786251068115, "epoch": 1.6397058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.813557953406789, "kl": 0.03787259757518768, "learning_rate": 5.081919952475583e-07, "loss": -0.0005, "num_tokens": 42248446.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5267914533615112, "sampling/importance_sampling_ratio/mean": 1.0002291202545166, "sampling/importance_sampling_ratio/min": 0.41856861114501953, "sampling/sampling_logp_difference/max": 0.8709144592285156, "sampling/sampling_logp_difference/mean": 0.015977103263139725, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.40405213832855225, "epoch": 1.6409313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.022911835051461407, "kl": 0.040140967816114426, "learning_rate": 5.074797035076318e-07, "loss": 0.0004, "num_tokens": 42274950.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4439164400100708, "sampling/importance_sampling_ratio/mean": 0.9999382495880127, "sampling/importance_sampling_ratio/min": 0.6416292786598206, "sampling/sampling_logp_difference/max": 0.4437446594238281, "sampling/sampling_logp_difference/mean": 0.01622859016060829, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 191.609375, "completions/mean_terminated_length": 191.609375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.367656946182251, "epoch": 1.642156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.9209673126351376, "kl": 0.03389076888561249, "learning_rate": 5.067673965843812e-07, "loss": -0.0071, "num_tokens": 42303917.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.620739459991455, "sampling/importance_sampling_ratio/mean": 1.0002752542495728, "sampling/importance_sampling_ratio/min": 0.679011881351471, "sampling/sampling_logp_difference/max": 0.4828824996948242, "sampling/sampling_logp_difference/mean": 0.014965730719268322, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3058163821697235, "epoch": 1.6433823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.027913752649485868, "kl": 0.03194785118103027, "learning_rate": 5.060550759237441e-07, "loss": 0.0003, "num_tokens": 42334013.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6105678081512451, "sampling/importance_sampling_ratio/mean": 1.0004963874816895, "sampling/importance_sampling_ratio/min": 0.6547014117240906, "sampling/sampling_logp_difference/max": 0.47658681869506836, "sampling/sampling_logp_difference/mean": 0.012903638184070587, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 176.984375, "completions/mean_terminated_length": 176.984375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.36098748445510864, "epoch": 1.6446078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.018980144265438367, "kl": 0.026891544461250305, "learning_rate": 5.053427429716866e-07, "loss": 0.0003, "num_tokens": 42364940.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2840707302093506, "sampling/importance_sampling_ratio/mean": 0.9999522566795349, "sampling/importance_sampling_ratio/min": 0.49538055062294006, "sampling/sampling_logp_difference/max": 0.7024290561676025, "sampling/sampling_logp_difference/mean": 0.014355067163705826, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 188.796875, "completions/mean_terminated_length": 188.796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.36714956164360046, "epoch": 1.6458333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 1.0230623433289197, "kl": 0.03373267501592636, "learning_rate": 5.046303991741993e-07, "loss": 0.0171, "num_tokens": 42394495.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000599980354309, "sampling/importance_sampling_ratio/min": 0.6436612606048584, "sampling/sampling_logp_difference/max": 0.7042334079742432, "sampling/sampling_logp_difference/mean": 0.014708654955029488, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 210.96875, "completions/mean_terminated_length": 210.96875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.49650660157203674, "epoch": 1.6470588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.832054013171931, "kl": 0.05175505578517914, "learning_rate": 5.039180459772949e-07, "loss": -0.0276, "num_tokens": 42425821.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6660652160644531, "sampling/importance_sampling_ratio/mean": 0.9998347759246826, "sampling/importance_sampling_ratio/min": 0.7393490076065063, "sampling/sampling_logp_difference/max": 0.5104646682739258, "sampling/sampling_logp_difference/mean": 0.018163859844207764, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 126.96875, "completions/mean_terminated_length": 126.96875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.35790514945983887, "epoch": 1.6482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.050137352484596526, "kl": 0.053024642169475555, "learning_rate": 5.032056848270056e-07, "loss": 0.0005, "num_tokens": 42448571.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4242204427719116, "sampling/importance_sampling_ratio/mean": 1.0000522136688232, "sampling/importance_sampling_ratio/min": 0.6195096373558044, "sampling/sampling_logp_difference/max": 0.47882699966430664, "sampling/sampling_logp_difference/mean": 0.01624734327197075, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 163.859375, "completions/mean_terminated_length": 163.859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.310820996761322, "epoch": 1.6495098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.026732463611402052, "kl": 0.0334944874048233, "learning_rate": 5.02493317169379e-07, "loss": 0.0003, "num_tokens": 42473746.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.318579077720642, "sampling/importance_sampling_ratio/mean": 0.9997299313545227, "sampling/importance_sampling_ratio/min": 0.5049257874488831, "sampling/sampling_logp_difference/max": 0.6833438873291016, "sampling/sampling_logp_difference/mean": 0.013605006039142609, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 173.984375, "completions/mean_terminated_length": 173.984375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.35105830430984497, "epoch": 1.6507352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.025926797408567508, "kl": 0.03746583312749863, "learning_rate": 5.017809444504767e-07, "loss": 0.0004, "num_tokens": 42504097.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.507704734802246, "sampling/importance_sampling_ratio/mean": 0.9996305704116821, "sampling/importance_sampling_ratio/min": 0.5362950563430786, "sampling/sampling_logp_difference/max": 0.6230708360671997, "sampling/sampling_logp_difference/mean": 0.014887738972902298, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 210.390625, "completions/mean_terminated_length": 210.390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3304094076156616, "epoch": 1.6519607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.020346980029235143, "kl": 0.026339039206504822, "learning_rate": 5.010685681163698e-07, "loss": 0.0003, "num_tokens": 42537594.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5016820430755615, "sampling/importance_sampling_ratio/mean": 1.0007579326629639, "sampling/importance_sampling_ratio/min": 0.6771543622016907, "sampling/sampling_logp_difference/max": 0.40658581256866455, "sampling/sampling_logp_difference/mean": 0.012531211599707603, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 149.703125, "completions/mean_terminated_length": 149.703125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.394240140914917, "epoch": 1.653186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.03682560528055482, "kl": 0.03671746701002121, "learning_rate": 5.003561896131374e-07, "loss": 0.0004, "num_tokens": 42568231.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.401877522468567, "sampling/importance_sampling_ratio/mean": 1.0008456707000732, "sampling/importance_sampling_ratio/min": 0.6810407638549805, "sampling/sampling_logp_difference/max": 0.38413310050964355, "sampling/sampling_logp_difference/mean": 0.015878338366746902, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 177.546875, "completions/mean_terminated_length": 177.546875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.35006511211395264, "epoch": 1.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.02562559019198169, "kl": 0.027174564078450203, "learning_rate": 4.996438103868625e-07, "loss": 0.0003, "num_tokens": 42598442.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3723374605178833, "sampling/importance_sampling_ratio/mean": 1.0002684593200684, "sampling/importance_sampling_ratio/min": 0.7244256138801575, "sampling/sampling_logp_difference/max": 0.3223762512207031, "sampling/sampling_logp_difference/mean": 0.013350119814276695, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 197.5625, "completions/mean_terminated_length": 197.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3624252378940582, "epoch": 1.655637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6550030764097664, "kl": 0.038706012070178986, "learning_rate": 4.989314318836302e-07, "loss": 0.0107, "num_tokens": 42627646.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5278007984161377, "sampling/importance_sampling_ratio/mean": 0.9998214840888977, "sampling/importance_sampling_ratio/min": 0.6209051012992859, "sampling/sampling_logp_difference/max": 0.4765770435333252, "sampling/sampling_logp_difference/mean": 0.01568310149013996, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 145.078125, "completions/mean_terminated_length": 145.078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.35354289412498474, "epoch": 1.656862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 1.0223500839934487, "kl": 0.04581998288631439, "learning_rate": 4.982190555495235e-07, "loss": 0.0, "num_tokens": 42650467.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.54673433303833, "sampling/importance_sampling_ratio/mean": 1.0001983642578125, "sampling/importance_sampling_ratio/min": 0.632348358631134, "sampling/sampling_logp_difference/max": 0.4583148956298828, "sampling/sampling_logp_difference/mean": 0.01473325490951538, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 162.265625, "completions/mean_terminated_length": 162.265625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.27945151925086975, "epoch": 1.6580882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.031127494861176195, "kl": 0.027711743488907814, "learning_rate": 4.975066828306209e-07, "loss": 0.0003, "num_tokens": 42676436.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6095050573349, "sampling/importance_sampling_ratio/mean": 1.0007215738296509, "sampling/importance_sampling_ratio/min": 0.632074773311615, "sampling/sampling_logp_difference/max": 0.47592663764953613, "sampling/sampling_logp_difference/mean": 0.012972263619303703, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 161.734375, "completions/mean_terminated_length": 161.734375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.29871469736099243, "epoch": 1.659313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.10170850044489836, "kl": 0.04948166757822037, "learning_rate": 4.967943151729944e-07, "loss": 0.0005, "num_tokens": 42701203.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4357198476791382, "sampling/importance_sampling_ratio/mean": 1.000241756439209, "sampling/importance_sampling_ratio/min": 0.6487295627593994, "sampling/sampling_logp_difference/max": 0.43273937702178955, "sampling/sampling_logp_difference/mean": 0.014093228615820408, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 148.046875, "completions/mean_terminated_length": 148.046875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.2653568983078003, "epoch": 1.6605392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8555905687961668, "kl": 0.028595855459570885, "learning_rate": 4.96081954022705e-07, "loss": -0.0465, "num_tokens": 42725318.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.508622169494629, "sampling/importance_sampling_ratio/mean": 0.9999791979789734, "sampling/importance_sampling_ratio/min": 0.6793181896209717, "sampling/sampling_logp_difference/max": 0.4111967086791992, "sampling/sampling_logp_difference/mean": 0.012172574177384377, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 162.734375, "completions/mean_terminated_length": 162.734375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3512003421783447, "epoch": 1.6617647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.9904731342825035, "kl": 0.05167574808001518, "learning_rate": 4.953696008258008e-07, "loss": 0.0539, "num_tokens": 42750789.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4245202541351318, "sampling/importance_sampling_ratio/mean": 1.0001580715179443, "sampling/importance_sampling_ratio/min": 0.7033551335334778, "sampling/sampling_logp_difference/max": 0.3538351058959961, "sampling/sampling_logp_difference/mean": 0.01564112678170204, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 141.546875, "completions/mean_terminated_length": 141.546875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.36807769536972046, "epoch": 1.6629901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.06052259875942137, "kl": 0.05655297264456749, "learning_rate": 4.946572570283134e-07, "loss": 0.0005, "num_tokens": 42776232.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5521773099899292, "sampling/importance_sampling_ratio/mean": 0.9987242221832275, "sampling/importance_sampling_ratio/min": 0.6245699524879456, "sampling/sampling_logp_difference/max": 0.4706919193267822, "sampling/sampling_logp_difference/mean": 0.017455413937568665, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 161.265625, "completions/mean_terminated_length": 161.265625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3822873532772064, "epoch": 1.6642156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.8381209088210724, "kl": 0.05772514268755913, "learning_rate": 4.939449240762558e-07, "loss": 0.0241, "num_tokens": 42803305.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3750104904174805, "sampling/importance_sampling_ratio/mean": 1.000871181488037, "sampling/importance_sampling_ratio/min": 0.6172491908073425, "sampling/sampling_logp_difference/max": 0.4824824333190918, "sampling/sampling_logp_difference/mean": 0.01507329661399126, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 152.421875, "completions/mean_terminated_length": 152.421875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3023286461830139, "epoch": 1.6654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.035516933832001996, "kl": 0.03554411977529526, "learning_rate": 4.932326034156189e-07, "loss": 0.0003, "num_tokens": 42832164.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4886304140090942, "sampling/importance_sampling_ratio/mean": 1.000510334968567, "sampling/importance_sampling_ratio/min": 0.37227627635002136, "sampling/sampling_logp_difference/max": 0.9881191253662109, "sampling/sampling_logp_difference/mean": 0.013887686654925346, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 156.859375, "completions/mean_terminated_length": 156.859375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.25837552547454834, "epoch": 1.6666666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.03076134315505385, "kl": 0.03431840240955353, "learning_rate": 4.925202964923683e-07, "loss": 0.0003, "num_tokens": 42856763.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7259713411331177, "sampling/importance_sampling_ratio/mean": 1.001096248626709, "sampling/importance_sampling_ratio/min": 0.6386511325836182, "sampling/sampling_logp_difference/max": 0.5457899570465088, "sampling/sampling_logp_difference/mean": 0.012383817695081234, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 171.484375, "completions/mean_terminated_length": 171.484375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3070116639137268, "epoch": 1.6678921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0349437000416922, "kl": 0.03264414519071579, "learning_rate": 4.918080047524417e-07, "loss": 0.0003, "num_tokens": 42882778.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5539450645446777, "sampling/importance_sampling_ratio/mean": 1.0001025199890137, "sampling/importance_sampling_ratio/min": 0.7177270650863647, "sampling/sampling_logp_difference/max": 0.4407968521118164, "sampling/sampling_logp_difference/mean": 0.01374280359596014, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.34527677297592163, "epoch": 1.6691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.08364935046106986, "kl": 0.06286908686161041, "learning_rate": 4.910957296417467e-07, "loss": 0.0006, "num_tokens": 42903158.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4781303405761719, "sampling/importance_sampling_ratio/mean": 1.0003399848937988, "sampling/importance_sampling_ratio/min": 0.5820320248603821, "sampling/sampling_logp_difference/max": 0.5412298440933228, "sampling/sampling_logp_difference/mean": 0.016989272087812424, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 200.21875, "completions/mean_terminated_length": 200.21875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.396662175655365, "epoch": 1.670343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8406850862689759, "kl": 0.05830325931310654, "learning_rate": 4.903834726061564e-07, "loss": 0.0501, "num_tokens": 42937284.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3477368354797363, "sampling/importance_sampling_ratio/mean": 1.0003334283828735, "sampling/importance_sampling_ratio/min": 0.6876862645149231, "sampling/sampling_logp_difference/max": 0.374422550201416, "sampling/sampling_logp_difference/mean": 0.015416143462061882, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 187.96875, "completions/mean_terminated_length": 187.96875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.36734241247177124, "epoch": 1.6715686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.03472094724967518, "kl": 0.03748122230172157, "learning_rate": 4.896712350915074e-07, "loss": 0.0004, "num_tokens": 42975026.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8012819290161133, "sampling/importance_sampling_ratio/mean": 1.0005528926849365, "sampling/importance_sampling_ratio/min": 0.6894164085388184, "sampling/sampling_logp_difference/max": 0.588498592376709, "sampling/sampling_logp_difference/mean": 0.014278611168265343, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 172.921875, "completions/mean_terminated_length": 172.921875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.27913814783096313, "epoch": 1.6727941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.7883877875965949, "kl": 0.040285248309373856, "learning_rate": 4.889590185435969e-07, "loss": -0.0038, "num_tokens": 43005197.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.48959219455719, "sampling/importance_sampling_ratio/mean": 0.9998310208320618, "sampling/importance_sampling_ratio/min": 0.6181638836860657, "sampling/sampling_logp_difference/max": 0.481001615524292, "sampling/sampling_logp_difference/mean": 0.012834897264838219, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.5182405710220337, "epoch": 1.6740196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.1479215998376797, "kl": 0.06698828935623169, "learning_rate": 4.882468244081792e-07, "loss": 0.0193, "num_tokens": 43039953.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.487452745437622, "sampling/importance_sampling_ratio/mean": 0.9995392560958862, "sampling/importance_sampling_ratio/min": 0.6213628053665161, "sampling/sampling_logp_difference/max": 0.47584009170532227, "sampling/sampling_logp_difference/mean": 0.01910814270377159, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 210.171875, "completions/mean_terminated_length": 210.171875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.2507001459598541, "epoch": 1.6752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02484434768268435, "kl": 0.025661654770374298, "learning_rate": 4.875346541309636e-07, "loss": 0.0002, "num_tokens": 43071852.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5987184047698975, "sampling/importance_sampling_ratio/mean": 1.000409483909607, "sampling/importance_sampling_ratio/min": 0.6142013669013977, "sampling/sampling_logp_difference/max": 0.48743247985839844, "sampling/sampling_logp_difference/mean": 0.011608692817389965, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 205.109375, "completions/mean_terminated_length": 205.109375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.36860013008117676, "epoch": 1.6764705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.942880839334349, "kl": 0.054536305367946625, "learning_rate": 4.868225091576102e-07, "loss": -0.0101, "num_tokens": 43103187.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3728216886520386, "sampling/importance_sampling_ratio/mean": 1.0003598928451538, "sampling/importance_sampling_ratio/min": 0.6970885992050171, "sampling/sampling_logp_difference/max": 0.3608427047729492, "sampling/sampling_logp_difference/mean": 0.01367130409926176, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 146.859375, "completions/mean_terminated_length": 146.859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3172333240509033, "epoch": 1.6776960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.067249806327941, "kl": 0.0470583513379097, "learning_rate": 4.861103909337285e-07, "loss": 0.0005, "num_tokens": 43131114.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.491654872894287, "sampling/importance_sampling_ratio/mean": 0.9995778799057007, "sampling/importance_sampling_ratio/min": 0.48319175839424133, "sampling/sampling_logp_difference/max": 0.7273416519165039, "sampling/sampling_logp_difference/mean": 0.015247553586959839, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 148.46875, "completions/mean_terminated_length": 148.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3860611021518707, "epoch": 1.678921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.0156624165494141, "kl": 0.04295976832509041, "learning_rate": 4.853983009048732e-07, "loss": 0.0092, "num_tokens": 43161064.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.9391186237335205, "sampling/importance_sampling_ratio/mean": 1.0002822875976562, "sampling/importance_sampling_ratio/min": 0.6218687891960144, "sampling/sampling_logp_difference/max": 0.6622335910797119, "sampling/sampling_logp_difference/mean": 0.016975287348031998, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 162.328125, "completions/mean_terminated_length": 162.328125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.45282042026519775, "epoch": 1.6801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.6421957872690638, "kl": 0.06159253418445587, "learning_rate": 4.84686240516542e-07, "loss": 0.0161, "num_tokens": 43188173.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6155743598937988, "sampling/importance_sampling_ratio/mean": 0.9999287128448486, "sampling/importance_sampling_ratio/min": 0.7227627038955688, "sampling/sampling_logp_difference/max": 0.4796905517578125, "sampling/sampling_logp_difference/mean": 0.01766914874315262, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 170.671875, "completions/mean_terminated_length": 170.671875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.28264009952545166, "epoch": 1.6813725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.033847431817872406, "kl": 0.026995070278644562, "learning_rate": 4.839742112141724e-07, "loss": 0.0003, "num_tokens": 43216984.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6612284183502197, "sampling/importance_sampling_ratio/mean": 1.0005111694335938, "sampling/importance_sampling_ratio/min": 0.7724306583404541, "sampling/sampling_logp_difference/max": 0.5075573921203613, "sampling/sampling_logp_difference/mean": 0.012486159801483154, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 153.453125, "completions/mean_terminated_length": 153.453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3153635859489441, "epoch": 1.6825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.035374205418365584, "kl": 0.02962607890367508, "learning_rate": 4.832622144431388e-07, "loss": 0.0003, "num_tokens": 43245573.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5984976291656494, "sampling/importance_sampling_ratio/mean": 0.9990425109863281, "sampling/importance_sampling_ratio/min": 0.6164947152137756, "sampling/sampling_logp_difference/max": 0.4837055206298828, "sampling/sampling_logp_difference/mean": 0.014347722753882408, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 192.71875, "completions/mean_terminated_length": 192.71875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.40542715787887573, "epoch": 1.6838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.02828922354908403, "kl": 0.03110560029745102, "learning_rate": 4.825502516487496e-07, "loss": 0.0003, "num_tokens": 43277875.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4067785739898682, "sampling/importance_sampling_ratio/mean": 0.9995977878570557, "sampling/importance_sampling_ratio/min": 0.6156638860702515, "sampling/sampling_logp_difference/max": 0.48505401611328125, "sampling/sampling_logp_difference/mean": 0.01650063879787922, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 198.046875, "completions/mean_terminated_length": 198.046875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.425992488861084, "epoch": 1.6850490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.03407945002471821, "kl": 0.043031804263591766, "learning_rate": 4.818383242762439e-07, "loss": 0.0004, "num_tokens": 43315366.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6155577898025513, "sampling/importance_sampling_ratio/mean": 1.0000079870224, "sampling/importance_sampling_ratio/min": 0.6148726940155029, "sampling/sampling_logp_difference/max": 0.4863399863243103, "sampling/sampling_logp_difference/mean": 0.01564926654100418, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 202.140625, "completions/mean_terminated_length": 202.140625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.37396860122680664, "epoch": 1.6862745098039216, "frac_reward_zero_std": 0.5, "grad_norm": 1.2579806973084713, "kl": 0.03807292878627777, "learning_rate": 4.811264337707894e-07, "loss": -0.0198, "num_tokens": 43343487.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7321727275848389, "sampling/importance_sampling_ratio/mean": 0.9996993541717529, "sampling/importance_sampling_ratio/min": 0.6481713652610779, "sampling/sampling_logp_difference/max": 0.5493764877319336, "sampling/sampling_logp_difference/mean": 0.013977523893117905, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 163.796875, "completions/mean_terminated_length": 163.796875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3286457061767578, "epoch": 1.6875, "frac_reward_zero_std": 1.0, "grad_norm": 0.028985537231318456, "kl": 0.03047393634915352, "learning_rate": 4.804145815774786e-07, "loss": 0.0003, "num_tokens": 43373970.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4068397283554077, "sampling/importance_sampling_ratio/mean": 1.0006426572799683, "sampling/importance_sampling_ratio/min": 0.6209167838096619, "sampling/sampling_logp_difference/max": 0.47655820846557617, "sampling/sampling_logp_difference/mean": 0.014818085357546806, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 212.890625, "completions/mean_terminated_length": 212.890625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.29821670055389404, "epoch": 1.6887254901960784, "frac_reward_zero_std": 1.0, "grad_norm": 0.03384399312511881, "kl": 0.027891390025615692, "learning_rate": 4.797027691413267e-07, "loss": 0.0003, "num_tokens": 43402059.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.459165334701538, "sampling/importance_sampling_ratio/mean": 1.000119924545288, "sampling/importance_sampling_ratio/min": 0.624224841594696, "sampling/sampling_logp_difference/max": 0.4712446928024292, "sampling/sampling_logp_difference/mean": 0.01404886320233345, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.36395466327667236, "epoch": 1.6899509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.03126411478502101, "kl": 0.03434750437736511, "learning_rate": 4.789909979072673e-07, "loss": 0.0003, "num_tokens": 43436975.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.693570852279663, "sampling/importance_sampling_ratio/mean": 1.0003466606140137, "sampling/importance_sampling_ratio/min": 0.643545389175415, "sampling/sampling_logp_difference/max": 0.5268392562866211, "sampling/sampling_logp_difference/mean": 0.014872172847390175, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 176.59375, "completions/mean_terminated_length": 176.59375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3995271325111389, "epoch": 1.6911764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02820292967795783, "kl": 0.03533459082245827, "learning_rate": 4.782792693201513e-07, "loss": 0.0003, "num_tokens": 43465701.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.663543939590454, "sampling/importance_sampling_ratio/mean": 0.9997713565826416, "sampling/importance_sampling_ratio/min": 0.5771487951278687, "sampling/sampling_logp_difference/max": 0.5496551990509033, "sampling/sampling_logp_difference/mean": 0.016274357214570045, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 175.9375, "completions/mean_terminated_length": 175.9375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3463321328163147, "epoch": 1.6924019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.026718688673992796, "kl": 0.03018825501203537, "learning_rate": 4.775675848247427e-07, "loss": 0.0003, "num_tokens": 43495425.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3740402460098267, "sampling/importance_sampling_ratio/mean": 1.0006554126739502, "sampling/importance_sampling_ratio/min": 0.6716427206993103, "sampling/sampling_logp_difference/max": 0.39802873134613037, "sampling/sampling_logp_difference/mean": 0.014302469789981842, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 168.390625, "completions/mean_terminated_length": 168.390625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.33125466108322144, "epoch": 1.6936274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.0846404275939687, "kl": 0.048324376344680786, "learning_rate": 4.768559458657155e-07, "loss": -0.0115, "num_tokens": 43521210.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.7379939556121826, "sampling/importance_sampling_ratio/mean": 0.9997254610061646, "sampling/importance_sampling_ratio/min": 0.6437204480171204, "sampling/sampling_logp_difference/max": 0.5527315139770508, "sampling/sampling_logp_difference/mean": 0.015096197836101055, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.33917608857154846, "epoch": 1.6948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.0286757095713985, "kl": 0.04030965641140938, "learning_rate": 4.7614435388765203e-07, "loss": 0.0118, "num_tokens": 43558202.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4172824621200562, "sampling/importance_sampling_ratio/mean": 1.000497817993164, "sampling/importance_sampling_ratio/min": 0.6317176818847656, "sampling/sampling_logp_difference/max": 0.45931267738342285, "sampling/sampling_logp_difference/mean": 0.014552392065525055, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 160.6875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3247225880622864, "epoch": 1.696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.03208284249245446, "kl": 0.03661260008811951, "learning_rate": 4.7543281033503885e-07, "loss": 0.0003, "num_tokens": 43587446.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5082286596298218, "sampling/importance_sampling_ratio/mean": 0.9997931122779846, "sampling/importance_sampling_ratio/min": 0.6297659277915955, "sampling/sampling_logp_difference/max": 0.46240711212158203, "sampling/sampling_logp_difference/mean": 0.015131104737520218, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 229.3125, "completions/mean_terminated_length": 229.3125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3170344829559326, "epoch": 1.6973039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02952600609357999, "kl": 0.02648162469267845, "learning_rate": 4.747213166522644e-07, "loss": 0.0003, "num_tokens": 43620186.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4534229040145874, "sampling/importance_sampling_ratio/mean": 1.0002045631408691, "sampling/importance_sampling_ratio/min": 0.5138672590255737, "sampling/sampling_logp_difference/max": 0.665790319442749, "sampling/sampling_logp_difference/mean": 0.013356797397136688, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 174.703125, "completions/mean_terminated_length": 174.703125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.34353625774383545, "epoch": 1.6985294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.030580744517143644, "kl": 0.029760949313640594, "learning_rate": 4.740098742836156e-07, "loss": 0.0003, "num_tokens": 43645447.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3198062181472778, "sampling/importance_sampling_ratio/mean": 1.0000088214874268, "sampling/importance_sampling_ratio/min": 0.6282213926315308, "sampling/sampling_logp_difference/max": 0.464862585067749, "sampling/sampling_logp_difference/mean": 0.014398678205907345, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.37205174565315247, "epoch": 1.6997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.7543692809234275, "kl": 0.042431462556123734, "learning_rate": 4.732984846732755e-07, "loss": -0.0045, "num_tokens": 43677231.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5407931804656982, "sampling/importance_sampling_ratio/mean": 1.000165581703186, "sampling/importance_sampling_ratio/min": 0.627170741558075, "sampling/sampling_logp_difference/max": 0.4665365219116211, "sampling/sampling_logp_difference/mean": 0.015096995048224926, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 205.28125, "completions/mean_terminated_length": 205.28125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2494359165430069, "epoch": 1.7009803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.017768629845976415, "kl": 0.020800625905394554, "learning_rate": 4.725871492653199e-07, "loss": 0.0002, "num_tokens": 43708065.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4053384065628052, "sampling/importance_sampling_ratio/mean": 1.0005443096160889, "sampling/importance_sampling_ratio/min": 0.4785323441028595, "sampling/sampling_logp_difference/max": 0.7370314598083496, "sampling/sampling_logp_difference/mean": 0.011222817935049534, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 182.171875, "completions/mean_terminated_length": 182.171875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.30386781692504883, "epoch": 1.7022058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.031091348803848076, "kl": 0.02845875173807144, "learning_rate": 4.718758695037149e-07, "loss": 0.0003, "num_tokens": 43735580.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.575518012046814, "sampling/importance_sampling_ratio/mean": 1.000417947769165, "sampling/importance_sampling_ratio/min": 0.717453122138977, "sampling/sampling_logp_difference/max": 0.45458412170410156, "sampling/sampling_logp_difference/mean": 0.013841615989804268, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 215.328125, "completions/mean_terminated_length": 215.328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3812558054924011, "epoch": 1.7034313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.9062121487325195, "kl": 0.03396788612008095, "learning_rate": 4.7116464683231285e-07, "loss": -0.0197, "num_tokens": 43772001.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6544450521469116, "sampling/importance_sampling_ratio/mean": 1.0001673698425293, "sampling/importance_sampling_ratio/min": 0.705818772315979, "sampling/sampling_logp_difference/max": 0.5034656524658203, "sampling/sampling_logp_difference/mean": 0.015301933512091637, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 208.28125, "completions/mean_terminated_length": 208.28125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.32804757356643677, "epoch": 1.704656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.017757286513298963, "kl": 0.025670060887932777, "learning_rate": 4.704534826948509e-07, "loss": 0.0002, "num_tokens": 43806579.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998500347137451, "sampling/importance_sampling_ratio/min": 0.620639443397522, "sampling/sampling_logp_difference/max": 0.809590220451355, "sampling/sampling_logp_difference/mean": 0.01399231143295765, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 176.328125, "completions/mean_terminated_length": 176.328125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.27521198987960815, "epoch": 1.7058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.02977600678266604, "kl": 0.03567678481340408, "learning_rate": 4.6974237853494744e-07, "loss": 0.0003, "num_tokens": 43836936.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3147550821304321, "sampling/importance_sampling_ratio/mean": 0.9992107152938843, "sampling/importance_sampling_ratio/min": 0.6593252420425415, "sampling/sampling_logp_difference/max": 0.4165383577346802, "sampling/sampling_logp_difference/mean": 0.013566594570875168, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 206.359375, "completions/mean_terminated_length": 206.359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3251534700393677, "epoch": 1.7071078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.020232729503715307, "kl": 0.02814178541302681, "learning_rate": 4.690313357960985e-07, "loss": 0.0003, "num_tokens": 43872319.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.581284761428833, "sampling/importance_sampling_ratio/mean": 0.999963104724884, "sampling/importance_sampling_ratio/min": 0.6158772706985474, "sampling/sampling_logp_difference/max": 0.4847075939178467, "sampling/sampling_logp_difference/mean": 0.0141825620085001, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 184.140625, "completions/mean_terminated_length": 184.140625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.41460761427879333, "epoch": 1.7083333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.022121295935993986, "kl": 0.028628060594201088, "learning_rate": 4.68320355921676e-07, "loss": 0.0003, "num_tokens": 43901976.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4436148405075073, "sampling/importance_sampling_ratio/mean": 0.9999883770942688, "sampling/importance_sampling_ratio/min": 0.6721982359886169, "sampling/sampling_logp_difference/max": 0.3972020149230957, "sampling/sampling_logp_difference/mean": 0.01625676453113556, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 256.0625, "completions/mean_terminated_length": 256.0625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3149504065513611, "epoch": 1.7095588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.023421245005584058, "kl": 0.031986355781555176, "learning_rate": 4.67609440354924e-07, "loss": 0.0003, "num_tokens": 43939308.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744267702102661, "sampling/importance_sampling_ratio/mean": 0.9998037815093994, "sampling/importance_sampling_ratio/min": 0.3797297179698944, "sampling/sampling_logp_difference/max": 0.9682955741882324, "sampling/sampling_logp_difference/mean": 0.013182253576815128, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 246.578125, "completions/mean_terminated_length": 246.578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.48535847663879395, "epoch": 1.7107843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 0.9976000775411583, "kl": 0.0374327227473259, "learning_rate": 4.668985905389563e-07, "loss": -0.0363, "num_tokens": 43977809.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4041378498077393, "sampling/importance_sampling_ratio/mean": 1.0008046627044678, "sampling/importance_sampling_ratio/min": 0.6904107332229614, "sampling/sampling_logp_difference/max": 0.3704686164855957, "sampling/sampling_logp_difference/mean": 0.01654989831149578, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 224.859375, "completions/mean_terminated_length": 224.859375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.38829389214515686, "epoch": 1.7120098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.02345620649634188, "kl": 0.02460196614265442, "learning_rate": 4.661878079167526e-07, "loss": 0.0002, "num_tokens": 44015032.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6182984113693237, "sampling/importance_sampling_ratio/mean": 0.9992150068283081, "sampling/importance_sampling_ratio/min": 0.42074137926101685, "sampling/sampling_logp_difference/max": 0.8657369613647461, "sampling/sampling_logp_difference/mean": 0.015943851321935654, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 172.375, "completions/mean_terminated_length": 172.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2884783148765564, "epoch": 1.7132352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.024885221888654482, "kl": 0.024858683347702026, "learning_rate": 4.6547709393115677e-07, "loss": 0.0003, "num_tokens": 44041216.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3691984415054321, "sampling/importance_sampling_ratio/mean": 1.000344157218933, "sampling/importance_sampling_ratio/min": 0.6060724258422852, "sampling/sampling_logp_difference/max": 0.500755786895752, "sampling/sampling_logp_difference/mean": 0.013669838197529316, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 206.5625, "completions/mean_terminated_length": 206.5625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3687763512134552, "epoch": 1.7144607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.016980934105717046, "kl": 0.021134931594133377, "learning_rate": 4.6476645002487295e-07, "loss": 0.0002, "num_tokens": 44076788.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.511935830116272, "sampling/importance_sampling_ratio/mean": 1.0003235340118408, "sampling/importance_sampling_ratio/min": 0.6470901966094971, "sampling/sampling_logp_difference/max": 0.4352695941925049, "sampling/sampling_logp_difference/mean": 0.014946680516004562, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 212.703125, "completions/mean_terminated_length": 212.703125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3485764265060425, "epoch": 1.715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.016084272629488215, "kl": 0.022283107042312622, "learning_rate": 4.640558776404639e-07, "loss": 0.0002, "num_tokens": 44113233.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.633171558380127, "sampling/importance_sampling_ratio/mean": 0.9996302127838135, "sampling/importance_sampling_ratio/min": 0.5327483415603638, "sampling/sampling_logp_difference/max": 0.6297061443328857, "sampling/sampling_logp_difference/mean": 0.01467338390648365, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 192.1875, "completions/mean_terminated_length": 192.1875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3813598155975342, "epoch": 1.7169117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.02383066227556483, "kl": 0.030649978667497635, "learning_rate": 4.633453782203458e-07, "loss": 0.0003, "num_tokens": 44140093.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 1.0003618001937866, "sampling/importance_sampling_ratio/min": 0.6177483201026917, "sampling/sampling_logp_difference/max": 0.4816741943359375, "sampling/sampling_logp_difference/mean": 0.015897084027528763, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 211.15625, "completions/mean_terminated_length": 211.15625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4610140323638916, "epoch": 1.718137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.086954599687401, "kl": 0.05906593054533005, "learning_rate": 4.626349532067879e-07, "loss": -0.0141, "num_tokens": 44171975.0, "reward": 0.65625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5080199241638184, "sampling/importance_sampling_ratio/mean": 1.0002214908599854, "sampling/importance_sampling_ratio/min": 0.7183539867401123, "sampling/sampling_logp_difference/max": 0.41079747676849365, "sampling/sampling_logp_difference/mean": 0.017302053049206734, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 162.765625, "completions/mean_terminated_length": 162.765625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.25882458686828613, "epoch": 1.719362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.025924261751398732, "kl": 0.027556277811527252, "learning_rate": 4.6192460404190793e-07, "loss": 0.0003, "num_tokens": 44199656.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6110548973083496, "sampling/importance_sampling_ratio/mean": 0.9999560117721558, "sampling/importance_sampling_ratio/min": 0.631548285484314, "sampling/sampling_logp_difference/max": 0.47688913345336914, "sampling/sampling_logp_difference/mean": 0.012271011248230934, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 217.8125, "completions/mean_terminated_length": 217.8125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.39787495136260986, "epoch": 1.7205882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.7389451406692283, "kl": 0.04811471700668335, "learning_rate": 4.6121433216766935e-07, "loss": 0.0022, "num_tokens": 44232796.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4469801187515259, "sampling/importance_sampling_ratio/mean": 0.9995118379592896, "sampling/importance_sampling_ratio/min": 0.6057416200637817, "sampling/sampling_logp_difference/max": 0.5013017654418945, "sampling/sampling_logp_difference/mean": 0.016226064413785934, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 237.046875, "completions/mean_terminated_length": 237.046875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3010721802711487, "epoch": 1.721813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.012379549934276401, "kl": 0.018051059916615486, "learning_rate": 4.605041390258794e-07, "loss": 0.0002, "num_tokens": 44265871.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4354110956192017, "sampling/importance_sampling_ratio/mean": 1.0000429153442383, "sampling/importance_sampling_ratio/min": 0.6600005030632019, "sampling/sampling_logp_difference/max": 0.4155147075653076, "sampling/sampling_logp_difference/mean": 0.012983039021492004, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 224.84375, "completions/mean_terminated_length": 224.84375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3763827681541443, "epoch": 1.7230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8051493766718345, "kl": 0.03543316572904587, "learning_rate": 4.5979402605818514e-07, "loss": 0.0311, "num_tokens": 44299413.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5788308382034302, "sampling/importance_sampling_ratio/mean": 0.9999749660491943, "sampling/importance_sampling_ratio/min": 0.6668434143066406, "sampling/sampling_logp_difference/max": 0.45668458938598633, "sampling/sampling_logp_difference/mean": 0.014188934117555618, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 181.046875, "completions/mean_terminated_length": 181.046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3947075307369232, "epoch": 1.7242647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0450270633718066, "kl": 0.03533687815070152, "learning_rate": 4.5908399470607104e-07, "loss": 0.0003, "num_tokens": 44327464.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.600734829902649, "sampling/importance_sampling_ratio/mean": 0.9991427063941956, "sampling/importance_sampling_ratio/min": 0.5616891384124756, "sampling/sampling_logp_difference/max": 0.5768067836761475, "sampling/sampling_logp_difference/mean": 0.016672521829605103, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 239.109375, "completions/mean_terminated_length": 239.109375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5134605765342712, "epoch": 1.7254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7564047078937739, "kl": 0.03194734454154968, "learning_rate": 4.5837404641085535e-07, "loss": 0.0078, "num_tokens": 44369247.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5701671838760376, "sampling/importance_sampling_ratio/mean": 0.9999627470970154, "sampling/importance_sampling_ratio/min": 0.6398777961730957, "sampling/sampling_logp_difference/max": 0.45118212699890137, "sampling/sampling_logp_difference/mean": 0.01758524775505066, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 211.09375, "completions/mean_terminated_length": 211.09375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.30869802832603455, "epoch": 1.7267156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.013130937363080112, "kl": 0.019469216465950012, "learning_rate": 4.576641826136884e-07, "loss": 0.0002, "num_tokens": 44401269.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4367069005966187, "sampling/importance_sampling_ratio/mean": 1.0001226663589478, "sampling/importance_sampling_ratio/min": 0.512790858745575, "sampling/sampling_logp_difference/max": 0.6678872108459473, "sampling/sampling_logp_difference/mean": 0.012972285971045494, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 178.90625, "completions/mean_terminated_length": 178.90625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.37025976181030273, "epoch": 1.7279411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.9805768712694561, "kl": 0.0347997285425663, "learning_rate": 4.5695440475554864e-07, "loss": -0.0126, "num_tokens": 44429775.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0007288455963135, "sampling/importance_sampling_ratio/min": 0.637222945690155, "sampling/sampling_logp_difference/max": 0.4506356716156006, "sampling/sampling_logp_difference/mean": 0.01550462655723095, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 237.78125, "completions/mean_terminated_length": 237.78125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.43542617559432983, "epoch": 1.7291666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.8403159944524539, "kl": 0.04189896583557129, "learning_rate": 4.5624471427724036e-07, "loss": 0.001, "num_tokens": 44458977.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.428770661354065, "sampling/importance_sampling_ratio/mean": 0.9994067549705505, "sampling/importance_sampling_ratio/min": 0.4924968481063843, "sampling/sampling_logp_difference/max": 0.7082672119140625, "sampling/sampling_logp_difference/mean": 0.016688797622919083, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 210.328125, "completions/mean_terminated_length": 210.328125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3765452802181244, "epoch": 1.7303921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.020520693560359245, "kl": 0.029866300523281097, "learning_rate": 4.5553511261939e-07, "loss": 0.0003, "num_tokens": 44491766.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5136628150939941, "sampling/importance_sampling_ratio/mean": 0.999915599822998, "sampling/importance_sampling_ratio/min": 0.6295127868652344, "sampling/sampling_logp_difference/max": 0.46280908584594727, "sampling/sampling_logp_difference/mean": 0.014515403658151627, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 197.859375, "completions/mean_terminated_length": 197.859375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3265450596809387, "epoch": 1.7316176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.02309653591519408, "kl": 0.030018430203199387, "learning_rate": 4.5482560122244407e-07, "loss": 0.0003, "num_tokens": 44518845.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.600726842880249, "sampling/importance_sampling_ratio/mean": 1.0002501010894775, "sampling/importance_sampling_ratio/min": 0.6692793965339661, "sampling/sampling_logp_difference/max": 0.4704577922821045, "sampling/sampling_logp_difference/mean": 0.014397569000720978, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 280.9375, "completions/mean_terminated_length": 280.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3380570113658905, "epoch": 1.732843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.6330278044158965, "kl": 0.022636758163571358, "learning_rate": 4.541161815266658e-07, "loss": 0.023, "num_tokens": 44554841.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.875036358833313, "sampling/importance_sampling_ratio/mean": 0.9998974204063416, "sampling/importance_sampling_ratio/min": 0.6907591223716736, "sampling/sampling_logp_difference/max": 0.6286280155181885, "sampling/sampling_logp_difference/mean": 0.0126770855858922, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.36764997243881226, "epoch": 1.7340686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 1.0385506943948144, "kl": 0.02965870499610901, "learning_rate": 4.534068549721324e-07, "loss": -0.0239, "num_tokens": 44582017.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.388352870941162, "sampling/importance_sampling_ratio/mean": 1.0002903938293457, "sampling/importance_sampling_ratio/min": 0.6088035106658936, "sampling/sampling_logp_difference/max": 0.4962596893310547, "sampling/sampling_logp_difference/mean": 0.0152654517441988, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 211.328125, "completions/mean_terminated_length": 211.328125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.474894642829895, "epoch": 1.7352941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.0246964759475967, "kl": 0.03367047384381294, "learning_rate": 4.5269762299873144e-07, "loss": -0.0211, "num_tokens": 44616950.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.396409034729004, "sampling/importance_sampling_ratio/mean": 0.9997467398643494, "sampling/importance_sampling_ratio/min": 0.6573418378829956, "sampling/sampling_logp_difference/max": 0.41955113410949707, "sampling/sampling_logp_difference/mean": 0.017158398404717445, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 232.859375, "completions/mean_terminated_length": 232.859375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.42667698860168457, "epoch": 1.7365196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.6762500874465751, "kl": 0.04127083718776703, "learning_rate": 4.519884870461591e-07, "loss": -0.0073, "num_tokens": 44651453.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.7663496732711792, "sampling/importance_sampling_ratio/mean": 0.9996098279953003, "sampling/importance_sampling_ratio/min": 0.6298543214797974, "sampling/sampling_logp_difference/max": 0.5689151287078857, "sampling/sampling_logp_difference/mean": 0.016091158613562584, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 166.46875, "completions/mean_terminated_length": 166.46875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3405911922454834, "epoch": 1.7377450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.123686574209456, "kl": 0.04614399001002312, "learning_rate": 4.512794485539165e-07, "loss": -0.024, "num_tokens": 44675787.0, "reward": 0.625, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.381252408027649, "sampling/importance_sampling_ratio/mean": 0.999666690826416, "sampling/importance_sampling_ratio/min": 0.6393997073173523, "sampling/sampling_logp_difference/max": 0.44722557067871094, "sampling/sampling_logp_difference/mean": 0.01435903925448656, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 201.390625, "completions/mean_terminated_length": 201.390625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3742409348487854, "epoch": 1.7389705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 0.7028407306720601, "kl": 0.02892223931849003, "learning_rate": 4.505705089613068e-07, "loss": -0.0251, "num_tokens": 44705412.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5027868747711182, "sampling/importance_sampling_ratio/mean": 1.0004254579544067, "sampling/importance_sampling_ratio/min": 0.6805267333984375, "sampling/sampling_logp_difference/max": 0.40732133388519287, "sampling/sampling_logp_difference/mean": 0.014049705117940903, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 177.609375, "completions/mean_terminated_length": 177.609375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.39864909648895264, "epoch": 1.7401960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 1.017394567395279, "kl": 0.0507853738963604, "learning_rate": 4.4986166970743233e-07, "loss": 0.0066, "num_tokens": 44730987.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5071645975112915, "sampling/importance_sampling_ratio/mean": 1.0001134872436523, "sampling/importance_sampling_ratio/min": 0.6924469470977783, "sampling/sampling_logp_difference/max": 0.4102301597595215, "sampling/sampling_logp_difference/mean": 0.015454644337296486, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 195.578125, "completions/mean_terminated_length": 195.578125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3866695463657379, "epoch": 1.741421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.8594666401532325, "kl": 0.029470369219779968, "learning_rate": 4.4915293223119205e-07, "loss": 0.0295, "num_tokens": 44759536.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4187947511672974, "sampling/importance_sampling_ratio/mean": 1.0001866817474365, "sampling/importance_sampling_ratio/min": 0.7082564830780029, "sampling/sampling_logp_difference/max": 0.3498077392578125, "sampling/sampling_logp_difference/mean": 0.015881307423114777, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 219.546875, "completions/mean_terminated_length": 219.546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.5283569097518921, "epoch": 1.7426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.8813196512801895, "kl": 0.04730577394366264, "learning_rate": 4.484442979712783e-07, "loss": -0.0017, "num_tokens": 44796531.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4558353424072266, "sampling/importance_sampling_ratio/mean": 1.0001245737075806, "sampling/importance_sampling_ratio/min": 0.6180657744407654, "sampling/sampling_logp_difference/max": 0.48116040229797363, "sampling/sampling_logp_difference/mean": 0.01736762933433056, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 152.03125, "completions/mean_terminated_length": 152.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3932775855064392, "epoch": 1.7438725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.9309149058833824, "kl": 0.06567516922950745, "learning_rate": 4.477357683661733e-07, "loss": -0.0048, "num_tokens": 44820533.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5347106456756592, "sampling/importance_sampling_ratio/mean": 1.000542402267456, "sampling/importance_sampling_ratio/min": 0.4917859435081482, "sampling/sampling_logp_difference/max": 0.7097117900848389, "sampling/sampling_logp_difference/mean": 0.015332138165831566, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 211.03125, "completions/mean_terminated_length": 211.03125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4433809518814087, "epoch": 1.7450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7713792047364969, "kl": 0.05297395586967468, "learning_rate": 4.470273448541475e-07, "loss": 0.0, "num_tokens": 44848423.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.9679169654846191, "sampling/importance_sampling_ratio/mean": 0.9995995759963989, "sampling/importance_sampling_ratio/min": 0.6510018110275269, "sampling/sampling_logp_difference/max": 0.6769756078720093, "sampling/sampling_logp_difference/mean": 0.01568237505853176, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 230.109375, "completions/mean_terminated_length": 230.109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.44413939118385315, "epoch": 1.7463235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.865555639371449, "kl": 0.04160609841346741, "learning_rate": 4.4631902887325567e-07, "loss": 0.0115, "num_tokens": 44885422.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6075966358184814, "sampling/importance_sampling_ratio/mean": 0.9996904134750366, "sampling/importance_sampling_ratio/min": 0.6300875544548035, "sampling/sampling_logp_difference/max": 0.47474026679992676, "sampling/sampling_logp_difference/mean": 0.015817858278751373, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 199.265625, "completions/mean_terminated_length": 199.265625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3973761796951294, "epoch": 1.7475490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.882389261024687, "kl": 0.06518774479627609, "learning_rate": 4.4561082186133456e-07, "loss": -0.0007, "num_tokens": 44910751.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4429669380187988, "sampling/importance_sampling_ratio/mean": 0.9999887347221375, "sampling/importance_sampling_ratio/min": 0.7577892541885376, "sampling/sampling_logp_difference/max": 0.3667013645172119, "sampling/sampling_logp_difference/mean": 0.014969083480536938, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4147040545940399, "epoch": 1.7487745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.9431794753665337, "kl": 0.026446394622325897, "learning_rate": 4.4490272525599936e-07, "loss": 0.0189, "num_tokens": 44943079.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5475490093231201, "sampling/importance_sampling_ratio/mean": 1.0002816915512085, "sampling/importance_sampling_ratio/min": 0.6187689304351807, "sampling/sampling_logp_difference/max": 0.4800233840942383, "sampling/sampling_logp_difference/mean": 0.01538955420255661, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 183.9375, "completions/mean_terminated_length": 183.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4222148060798645, "epoch": 1.75, "frac_reward_zero_std": 0.75, "grad_norm": 0.9239683154757388, "kl": 0.0388445146381855, "learning_rate": 4.4419474049464135e-07, "loss": 0.0091, "num_tokens": 44969763.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.291072130203247, "sampling/importance_sampling_ratio/mean": 1.0004539489746094, "sampling/importance_sampling_ratio/min": 0.7533888816833496, "sampling/sampling_logp_difference/max": 0.2831737995147705, "sampling/sampling_logp_difference/mean": 0.014346310868859291, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 213.484375, "completions/mean_terminated_length": 213.484375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.37551143765449524, "epoch": 1.7512254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.7062470331062345, "kl": 0.04760557413101196, "learning_rate": 4.43486869014425e-07, "loss": 0.0071, "num_tokens": 45004482.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4682234525680542, "sampling/importance_sampling_ratio/mean": 0.999937891960144, "sampling/importance_sampling_ratio/min": 0.6740283370018005, "sampling/sampling_logp_difference/max": 0.39448320865631104, "sampling/sampling_logp_difference/mean": 0.0134231336414814, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 238.234375, "completions/mean_terminated_length": 238.234375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.38191598653793335, "epoch": 1.7524509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.02758437666281902, "kl": 0.034000493586063385, "learning_rate": 4.427791122522841e-07, "loss": 0.0003, "num_tokens": 45046049.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4964890480041504, "sampling/importance_sampling_ratio/mean": 1.0001420974731445, "sampling/importance_sampling_ratio/min": 0.6302828788757324, "sampling/sampling_logp_difference/max": 0.46158647537231445, "sampling/sampling_logp_difference/mean": 0.014439761638641357, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 235.859375, "completions/mean_terminated_length": 235.859375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3631054162979126, "epoch": 1.7536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.7424446221644718, "kl": 0.039311766624450684, "learning_rate": 4.420714716449203e-07, "loss": -0.0052, "num_tokens": 45078072.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4538235664367676, "sampling/importance_sampling_ratio/mean": 0.9998072385787964, "sampling/importance_sampling_ratio/min": 0.6587311625480652, "sampling/sampling_logp_difference/max": 0.4174398183822632, "sampling/sampling_logp_difference/mean": 0.01360815018415451, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 223.140625, "completions/mean_terminated_length": 223.140625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3967345952987671, "epoch": 1.7549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.5323812009704646, "kl": 0.033599622547626495, "learning_rate": 4.413639486287991e-07, "loss": -0.018, "num_tokens": 45110897.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6360160112380981, "sampling/importance_sampling_ratio/mean": 1.0001624822616577, "sampling/importance_sampling_ratio/min": 0.7722803950309753, "sampling/sampling_logp_difference/max": 0.4922640323638916, "sampling/sampling_logp_difference/mean": 0.013910435140132904, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 202.84375, "completions/mean_terminated_length": 202.84375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.41037964820861816, "epoch": 1.7561274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.019878875723460726, "kl": 0.03348096087574959, "learning_rate": 4.406565446401476e-07, "loss": 0.0003, "num_tokens": 45139751.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5786607265472412, "sampling/importance_sampling_ratio/mean": 1.000056266784668, "sampling/importance_sampling_ratio/min": 0.7140791416168213, "sampling/sampling_logp_difference/max": 0.4565768241882324, "sampling/sampling_logp_difference/mean": 0.015522721223533154, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 184.078125, "completions/mean_terminated_length": 184.078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4791666865348816, "epoch": 1.7573529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 1.127227683632228, "kl": 0.05700379237532616, "learning_rate": 4.399492611149509e-07, "loss": 0.0358, "num_tokens": 45168332.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4940515756607056, "sampling/importance_sampling_ratio/mean": 1.000679850578308, "sampling/importance_sampling_ratio/min": 0.7372725605964661, "sampling/sampling_logp_difference/max": 0.401491641998291, "sampling/sampling_logp_difference/mean": 0.017366910353302956, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5041054487228394, "epoch": 1.758578431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7247880014361506, "kl": 0.041422732174396515, "learning_rate": 4.392420994889498e-07, "loss": -0.0423, "num_tokens": 45199676.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5277611017227173, "sampling/importance_sampling_ratio/mean": 1.0000765323638916, "sampling/importance_sampling_ratio/min": 0.705386757850647, "sampling/sampling_logp_difference/max": 0.42380332946777344, "sampling/sampling_logp_difference/mean": 0.017310116440057755, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 191.578125, "completions/mean_terminated_length": 191.578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.40415358543395996, "epoch": 1.7598039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.024389198589871545, "kl": 0.03791702538728714, "learning_rate": 4.385350611976376e-07, "loss": 0.0003, "num_tokens": 45227505.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3794703483581543, "sampling/importance_sampling_ratio/mean": 1.0002727508544922, "sampling/importance_sampling_ratio/min": 0.6700911521911621, "sampling/sampling_logp_difference/max": 0.4003415107727051, "sampling/sampling_logp_difference/mean": 0.01557602733373642, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 197.6875, "completions/mean_terminated_length": 197.6875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.48839443922042847, "epoch": 1.7610294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.747341235613586, "kl": 0.03958131745457649, "learning_rate": 4.3782814767625755e-07, "loss": -0.0077, "num_tokens": 45258205.0, "reward": -0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4346733093261719, "sampling/importance_sampling_ratio/mean": 1.0000269412994385, "sampling/importance_sampling_ratio/min": 0.6903257966041565, "sampling/sampling_logp_difference/max": 0.3705916404724121, "sampling/sampling_logp_difference/mean": 0.01604308933019638, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 248.734375, "completions/mean_terminated_length": 248.734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.46708595752716064, "epoch": 1.7622549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.7063798639318799, "kl": 0.04332081228494644, "learning_rate": 4.371213603597987e-07, "loss": -0.0109, "num_tokens": 45290860.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3585596084594727, "sampling/importance_sampling_ratio/mean": 1.0002461671829224, "sampling/importance_sampling_ratio/min": 0.5722141861915588, "sampling/sampling_logp_difference/max": 0.5582419633865356, "sampling/sampling_logp_difference/mean": 0.015267834067344666, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 237.03125, "completions/mean_terminated_length": 237.03125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.40694278478622437, "epoch": 1.7634803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.01716887672229882, "kl": 0.028053196147084236, "learning_rate": 4.3641470068299483e-07, "loss": 0.0003, "num_tokens": 45331198.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.313656210899353, "sampling/importance_sampling_ratio/mean": 1.0002212524414062, "sampling/importance_sampling_ratio/min": 0.6681939363479614, "sampling/sampling_logp_difference/max": 0.40317678451538086, "sampling/sampling_logp_difference/mean": 0.01535987388342619, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 249.90625, "completions/mean_terminated_length": 249.90625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.46517300605773926, "epoch": 1.7647058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0968058975377974, "kl": 0.03315602242946625, "learning_rate": 4.3570817008032044e-07, "loss": 0.0297, "num_tokens": 45364568.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.408729910850525, "sampling/importance_sampling_ratio/mean": 0.9997270703315735, "sampling/importance_sampling_ratio/min": 0.6681921482086182, "sampling/sampling_logp_difference/max": 0.4031795263290405, "sampling/sampling_logp_difference/mean": 0.013626371510326862, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 148.9375, "completions/mean_terminated_length": 148.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3735087811946869, "epoch": 1.7659313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 1.0202814849083852, "kl": 0.061778899282217026, "learning_rate": 4.350017699859877e-07, "loss": 0.0152, "num_tokens": 45386980.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.535566806793213, "sampling/importance_sampling_ratio/mean": 1.0005061626434326, "sampling/importance_sampling_ratio/min": 0.6316543221473694, "sampling/sampling_logp_difference/max": 0.4594130516052246, "sampling/sampling_logp_difference/mean": 0.01528320275247097, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.46313539147377014, "epoch": 1.767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.8291787052513004, "kl": 0.04051543399691582, "learning_rate": 4.342955018339441e-07, "loss": 0.0094, "num_tokens": 45419580.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2963625192642212, "sampling/importance_sampling_ratio/mean": 1.0001630783081055, "sampling/importance_sampling_ratio/min": 0.6908029317855835, "sampling/sampling_logp_difference/max": 0.3699007034301758, "sampling/sampling_logp_difference/mean": 0.015424519777297974, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.47829362750053406, "epoch": 1.7683823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 0.9674432569519643, "kl": 0.04246748983860016, "learning_rate": 4.335893670578694e-07, "loss": 0.0047, "num_tokens": 45459844.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5277862548828125, "sampling/importance_sampling_ratio/mean": 1.000131368637085, "sampling/importance_sampling_ratio/min": 0.6370444893836975, "sampling/sampling_logp_difference/max": 0.4509158134460449, "sampling/sampling_logp_difference/mean": 0.015260925516486168, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 164.84375, "completions/mean_terminated_length": 164.84375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3611081838607788, "epoch": 1.7696078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0393042253065224, "kl": 0.0389854833483696, "learning_rate": 4.328833670911724e-07, "loss": 0.0004, "num_tokens": 45485514.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5949219465255737, "sampling/importance_sampling_ratio/mean": 0.9993205666542053, "sampling/importance_sampling_ratio/min": 0.5590474605560303, "sampling/sampling_logp_difference/max": 0.5815209746360779, "sampling/sampling_logp_difference/mean": 0.014122438617050648, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 186.59375, "completions/mean_terminated_length": 186.59375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4266851842403412, "epoch": 1.7708333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.9178166907920076, "kl": 0.05023089796304703, "learning_rate": 4.3217750336698803e-07, "loss": 0.0046, "num_tokens": 45511424.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997497200965881, "sampling/importance_sampling_ratio/min": 0.6100702881813049, "sampling/sampling_logp_difference/max": 0.7019822597503662, "sampling/sampling_logp_difference/mean": 0.015094645321369171, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 176.640625, "completions/mean_terminated_length": 176.640625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4793596863746643, "epoch": 1.7720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.026665708128201734, "kl": 0.04186504706740379, "learning_rate": 4.314717773181752e-07, "loss": 0.0004, "num_tokens": 45541577.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3670605421066284, "sampling/importance_sampling_ratio/mean": 1.000087022781372, "sampling/importance_sampling_ratio/min": 0.6345877647399902, "sampling/sampling_logp_difference/max": 0.45477962493896484, "sampling/sampling_logp_difference/mean": 0.016608726233243942, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 186.953125, "completions/mean_terminated_length": 186.953125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4314137101173401, "epoch": 1.7732843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.023656792639340025, "kl": 0.040345869958400726, "learning_rate": 4.3076619037731287e-07, "loss": 0.0004, "num_tokens": 45569238.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3493237495422363, "sampling/importance_sampling_ratio/mean": 1.0000889301300049, "sampling/importance_sampling_ratio/min": 0.5987197756767273, "sampling/sampling_logp_difference/max": 0.5129616260528564, "sampling/sampling_logp_difference/mean": 0.016253970563411713, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 230.609375, "completions/mean_terminated_length": 230.609375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.45809489488601685, "epoch": 1.7745098039215685, "frac_reward_zero_std": 0.5, "grad_norm": 1.1351322335383285, "kl": 0.031431425362825394, "learning_rate": 4.3006074397669836e-07, "loss": -0.0083, "num_tokens": 45604077.0, "reward": 0.78125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.284456491470337, "sampling/importance_sampling_ratio/mean": 1.000319480895996, "sampling/importance_sampling_ratio/min": 0.6482195258140564, "sampling/sampling_logp_difference/max": 0.43352580070495605, "sampling/sampling_logp_difference/mean": 0.01463567465543747, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 264.46875, "completions/mean_terminated_length": 264.46875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4789225161075592, "epoch": 1.7757352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.9112477610484371, "kl": 0.04142525792121887, "learning_rate": 4.293554395483425e-07, "loss": -0.0005, "num_tokens": 45644155.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6009043455123901, "sampling/importance_sampling_ratio/mean": 0.9999097585678101, "sampling/importance_sampling_ratio/min": 0.7041060924530029, "sampling/sampling_logp_difference/max": 0.4705686569213867, "sampling/sampling_logp_difference/mean": 0.015574609860777855, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 258.890625, "completions/mean_terminated_length": 258.890625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.43155670166015625, "epoch": 1.7769607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.01368765642754883, "kl": 0.024679599329829216, "learning_rate": 4.2865027852396894e-07, "loss": 0.0002, "num_tokens": 45680788.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5347754955291748, "sampling/importance_sampling_ratio/mean": 1.0001623630523682, "sampling/importance_sampling_ratio/min": 0.7059873938560486, "sampling/sampling_logp_difference/max": 0.42838406562805176, "sampling/sampling_logp_difference/mean": 0.014761995524168015, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 172.328125, "completions/mean_terminated_length": 172.328125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.42772746086120605, "epoch": 1.778186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.02520302182925972, "kl": 0.04123090207576752, "learning_rate": 4.2794526233501004e-07, "loss": 0.0004, "num_tokens": 45706457.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.52882719039917, "sampling/importance_sampling_ratio/mean": 0.9998812079429626, "sampling/importance_sampling_ratio/min": 0.6311986446380615, "sampling/sampling_logp_difference/max": 0.46013474464416504, "sampling/sampling_logp_difference/mean": 0.015842996537685394, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 236.421875, "completions/mean_terminated_length": 236.421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5397396683692932, "epoch": 1.7794117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.9810596928979152, "kl": 0.07520410418510437, "learning_rate": 4.272403924126035e-07, "loss": 0.0173, "num_tokens": 45739348.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5650054216384888, "sampling/importance_sampling_ratio/mean": 1.0001192092895508, "sampling/importance_sampling_ratio/min": 0.6933867931365967, "sampling/sampling_logp_difference/max": 0.4478893280029297, "sampling/sampling_logp_difference/mean": 0.017063235864043236, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 233.09375, "completions/mean_terminated_length": 233.09375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.41079872846603394, "epoch": 1.780637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6442710194208022, "kl": 0.02720138430595398, "learning_rate": 4.2653567018759103e-07, "loss": -0.0061, "num_tokens": 45775434.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5081772804260254, "sampling/importance_sampling_ratio/mean": 1.0005946159362793, "sampling/importance_sampling_ratio/min": 0.5385057926177979, "sampling/sampling_logp_difference/max": 0.6189570426940918, "sampling/sampling_logp_difference/mean": 0.01502861175686121, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 233.296875, "completions/mean_terminated_length": 233.296875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.40316036343574524, "epoch": 1.781862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.6642370122603454, "kl": 0.03852131590247154, "learning_rate": 4.258310970905139e-07, "loss": -0.0014, "num_tokens": 45810909.0, "reward": -0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5914466381072998, "sampling/importance_sampling_ratio/mean": 0.9998533725738525, "sampling/importance_sampling_ratio/min": 0.6117665767669678, "sampling/sampling_logp_difference/max": 0.49140453338623047, "sampling/sampling_logp_difference/mean": 0.01392968650907278, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 240.9375, "completions/mean_terminated_length": 240.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.46248531341552734, "epoch": 1.7830882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.017064247998052156, "kl": 0.030550135299563408, "learning_rate": 4.251266745516112e-07, "loss": 0.0003, "num_tokens": 45849609.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5256630182266235, "sampling/importance_sampling_ratio/mean": 0.9996311664581299, "sampling/importance_sampling_ratio/min": 0.6598731875419617, "sampling/sampling_logp_difference/max": 0.42242908477783203, "sampling/sampling_logp_difference/mean": 0.016041986644268036, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 192.34375, "completions/mean_terminated_length": 192.34375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.4016135334968567, "epoch": 1.784313725490196, "frac_reward_zero_std": 0.75, "grad_norm": 1.019064933887383, "kl": 0.03971964493393898, "learning_rate": 4.2442240400081556e-07, "loss": -0.0196, "num_tokens": 45881071.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5750302076339722, "sampling/importance_sampling_ratio/mean": 0.9997595548629761, "sampling/importance_sampling_ratio/min": 0.4159521460533142, "sampling/sampling_logp_difference/max": 0.8771851062774658, "sampling/sampling_logp_difference/mean": 0.014799212105572224, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 227.890625, "completions/mean_terminated_length": 227.890625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.35006678104400635, "epoch": 1.7855392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.710172264158936, "kl": 0.02738107740879059, "learning_rate": 4.2371828686775186e-07, "loss": 0.0069, "num_tokens": 45916424.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.650227427482605, "sampling/importance_sampling_ratio/mean": 1.0000296831130981, "sampling/importance_sampling_ratio/min": 0.6437652707099915, "sampling/sampling_logp_difference/max": 0.500913143157959, "sampling/sampling_logp_difference/mean": 0.012949788942933083, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 200.609375, "completions/mean_terminated_length": 200.609375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4181138277053833, "epoch": 1.7867647058823528, "frac_reward_zero_std": 0.5, "grad_norm": 1.3334042675386486, "kl": 0.05847199261188507, "learning_rate": 4.2301432458173316e-07, "loss": -0.0091, "num_tokens": 45941807.0, "reward": -0.21875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": -0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.508720874786377, "sampling/importance_sampling_ratio/mean": 1.0001122951507568, "sampling/importance_sampling_ratio/min": 0.6050756573677063, "sampling/sampling_logp_difference/max": 0.5024018287658691, "sampling/sampling_logp_difference/mean": 0.016681130975484848, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 181.015625, "completions/mean_terminated_length": 181.015625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.46196916699409485, "epoch": 1.7879901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.1725318917114993, "kl": 0.07137738913297653, "learning_rate": 4.223105185717585e-07, "loss": -0.0018, "num_tokens": 45968352.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4689947366714478, "sampling/importance_sampling_ratio/mean": 0.9999860525131226, "sampling/importance_sampling_ratio/min": 0.7149273157119751, "sampling/sampling_logp_difference/max": 0.38457822799682617, "sampling/sampling_logp_difference/mean": 0.017508184537291527, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 203.6875, "completions/mean_terminated_length": 203.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3804335594177246, "epoch": 1.7892156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.019368482660348265, "kl": 0.027461178600788116, "learning_rate": 4.216068702665093e-07, "loss": 0.0003, "num_tokens": 45998796.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4015306234359741, "sampling/importance_sampling_ratio/mean": 1.0002665519714355, "sampling/importance_sampling_ratio/min": 0.6131548881530762, "sampling/sampling_logp_difference/max": 0.4891376495361328, "sampling/sampling_logp_difference/mean": 0.015096787363290787, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 235.234375, "completions/mean_terminated_length": 235.234375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.37548473477363586, "epoch": 1.7904411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.015921481218471225, "kl": 0.025881817564368248, "learning_rate": 4.2090338109434703e-07, "loss": 0.0003, "num_tokens": 46035595.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6298270225524902, "sampling/importance_sampling_ratio/mean": 0.9997875690460205, "sampling/importance_sampling_ratio/min": 0.6951361894607544, "sampling/sampling_logp_difference/max": 0.48847389221191406, "sampling/sampling_logp_difference/mean": 0.014340518973767757, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 207.984375, "completions/mean_terminated_length": 207.984375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3004228472709656, "epoch": 1.7916666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.013667664901205656, "kl": 0.019026055932044983, "learning_rate": 4.202000524833105e-07, "loss": 0.0002, "num_tokens": 46069626.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.0003504753112793, "sampling/importance_sampling_ratio/min": 0.6548967957496643, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.01225491613149643, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 226.90625, "completions/mean_terminated_length": 226.90625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.39118072390556335, "epoch": 1.7928921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.016486895443134506, "kl": 0.032455939799547195, "learning_rate": 4.194968858611117e-07, "loss": 0.0003, "num_tokens": 46102884.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0000404119491577, "sampling/importance_sampling_ratio/min": 0.6678564548492432, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.013750225305557251, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 250.859375, "completions/mean_terminated_length": 250.859375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.43674230575561523, "epoch": 1.7941176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.8923833983931393, "kl": 0.03069932386279106, "learning_rate": 4.187938826551346e-07, "loss": 0.0301, "num_tokens": 46146075.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 0.9999037981033325, "sampling/importance_sampling_ratio/min": 0.3847516179084778, "sampling/sampling_logp_difference/max": 0.9551572799682617, "sampling/sampling_logp_difference/mean": 0.015772782266139984, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 160.953125, "completions/mean_terminated_length": 160.953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3915073275566101, "epoch": 1.795343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.0581596796611143, "kl": 0.04968298599123955, "learning_rate": 4.180910442924311e-07, "loss": 0.0119, "num_tokens": 46170456.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5006968975067139, "sampling/importance_sampling_ratio/mean": 0.9995827078819275, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.015751594677567482, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.35393333435058594, "epoch": 1.7965686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.7935912510779416, "kl": 0.030864953994750977, "learning_rate": 4.173883721997188e-07, "loss": -0.0084, "num_tokens": 46205400.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5241999626159668, "sampling/importance_sampling_ratio/mean": 0.999489426612854, "sampling/importance_sampling_ratio/min": 0.6623748540878296, "sampling/sampling_logp_difference/max": 0.42146968841552734, "sampling/sampling_logp_difference/mean": 0.013956794515252113, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 216.8125, "completions/mean_terminated_length": 216.8125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3866092264652252, "epoch": 1.7977941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.026090015442885556, "kl": 0.03396444022655487, "learning_rate": 4.1668586780337713e-07, "loss": 0.0004, "num_tokens": 46233532.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5496832132339478, "sampling/importance_sampling_ratio/mean": 0.9995602369308472, "sampling/importance_sampling_ratio/min": 0.6023508906364441, "sampling/sampling_logp_difference/max": 0.5069150924682617, "sampling/sampling_logp_difference/mean": 0.015118034556508064, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 202.34375, "completions/mean_terminated_length": 202.34375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3430424630641937, "epoch": 1.7990196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 1.084819651898561, "kl": 0.03494244068861008, "learning_rate": 4.159835325294457e-07, "loss": 0.0014, "num_tokens": 46259394.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4407918453216553, "sampling/importance_sampling_ratio/mean": 0.9999051094055176, "sampling/importance_sampling_ratio/min": 0.6133283376693726, "sampling/sampling_logp_difference/max": 0.48885488510131836, "sampling/sampling_logp_difference/mean": 0.01425672322511673, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 157.515625, "completions/mean_terminated_length": 157.515625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.39985328912734985, "epoch": 1.8002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02940842733886677, "kl": 0.0676310583949089, "learning_rate": 4.152813678036208e-07, "loss": 0.0006, "num_tokens": 46288051.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00049889087677, "sampling/importance_sampling_ratio/min": 0.6191496253013611, "sampling/sampling_logp_difference/max": 0.7354001998901367, "sampling/sampling_logp_difference/mean": 0.016767770051956177, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 191.28125, "completions/mean_terminated_length": 191.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4206755459308624, "epoch": 1.8014705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.4012676856935273, "kl": 0.0800822526216507, "learning_rate": 4.145793750512522e-07, "loss": 0.0496, "num_tokens": 46316501.0, "reward": -0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": -0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.6270153522491455, "sampling/importance_sampling_ratio/mean": 0.9992214441299438, "sampling/importance_sampling_ratio/min": 0.5298165082931519, "sampling/sampling_logp_difference/max": 0.6352245807647705, "sampling/sampling_logp_difference/mean": 0.017360102385282516, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 198.65625, "completions/mean_terminated_length": 198.65625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.42161741852760315, "epoch": 1.8026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.042891532642960015, "kl": 0.059884097427129745, "learning_rate": 4.1387755569734054e-07, "loss": 0.0006, "num_tokens": 46347743.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997487664222717, "sampling/importance_sampling_ratio/min": 0.614323616027832, "sampling/sampling_logp_difference/max": 0.7349467277526855, "sampling/sampling_logp_difference/mean": 0.015134022571146488, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 204.765625, "completions/mean_terminated_length": 204.765625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.39879900217056274, "epoch": 1.803921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.01481514788890377, "kl": 0.025549769401550293, "learning_rate": 4.131759111665348e-07, "loss": 0.0003, "num_tokens": 46381808.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5880461931228638, "sampling/importance_sampling_ratio/mean": 1.0000556707382202, "sampling/importance_sampling_ratio/min": 0.6291193962097168, "sampling/sampling_logp_difference/max": 0.46343421936035156, "sampling/sampling_logp_difference/mean": 0.01481297705322504, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 173.484375, "completions/mean_terminated_length": 173.484375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.40065518021583557, "epoch": 1.8051470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.05139028458355681, "kl": 0.07758373022079468, "learning_rate": 4.1247444288312895e-07, "loss": 0.0007, "num_tokens": 46410367.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.367601990699768, "sampling/importance_sampling_ratio/mean": 0.9998440146446228, "sampling/importance_sampling_ratio/min": 0.6487096548080444, "sampling/sampling_logp_difference/max": 0.4327700138092041, "sampling/sampling_logp_difference/mean": 0.016905371099710464, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 256.609375, "completions/mean_terminated_length": 256.609375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5070643424987793, "epoch": 1.8063725490196079, "frac_reward_zero_std": 0.5, "grad_norm": 1.0673609305951133, "kl": 0.04940847307443619, "learning_rate": 4.1177315227105926e-07, "loss": 0.0081, "num_tokens": 46447670.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6533323526382446, "sampling/importance_sampling_ratio/mean": 1.0003235340118408, "sampling/importance_sampling_ratio/min": 0.6956043839454651, "sampling/sampling_logp_difference/max": 0.5027928352355957, "sampling/sampling_logp_difference/mean": 0.017348986119031906, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 191.140625, "completions/mean_terminated_length": 191.140625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.36436885595321655, "epoch": 1.8075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7400154310148165, "kl": 0.05223090946674347, "learning_rate": 4.1107204075390096e-07, "loss": -0.0058, "num_tokens": 46473247.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3586506843566895, "sampling/importance_sampling_ratio/mean": 0.9999962449073792, "sampling/importance_sampling_ratio/min": 0.6302604079246521, "sampling/sampling_logp_difference/max": 0.4616222381591797, "sampling/sampling_logp_difference/mean": 0.013969759456813335, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 198.609375, "completions/mean_terminated_length": 198.609375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3275294303894043, "epoch": 1.8088235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.9667105337084376, "kl": 0.02661885693669319, "learning_rate": 4.1037110975486617e-07, "loss": -0.019, "num_tokens": 46503126.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5046790838241577, "sampling/importance_sampling_ratio/mean": 1.0004401206970215, "sampling/importance_sampling_ratio/min": 0.5255032181739807, "sampling/sampling_logp_difference/max": 0.6433990001678467, "sampling/sampling_logp_difference/mean": 0.013211781159043312, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 233.578125, "completions/mean_terminated_length": 233.578125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.33777767419815063, "epoch": 1.8100490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.026791804709812933, "kl": 0.025295440107584, "learning_rate": 4.096703606968006e-07, "loss": 0.0003, "num_tokens": 46535675.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6054742336273193, "sampling/importance_sampling_ratio/mean": 1.0002944469451904, "sampling/importance_sampling_ratio/min": 0.6139808297157288, "sampling/sampling_logp_difference/max": 0.4877915382385254, "sampling/sampling_logp_difference/mean": 0.0140210697427392, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 230.671875, "completions/mean_terminated_length": 230.671875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.40689587593078613, "epoch": 1.8112745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.014784997343498536, "kl": 0.025457806885242462, "learning_rate": 4.0896979500218014e-07, "loss": 0.0002, "num_tokens": 46575942.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5478439331054688, "sampling/importance_sampling_ratio/mean": 0.9997664093971252, "sampling/importance_sampling_ratio/min": 0.7019744515419006, "sampling/sampling_logp_difference/max": 0.4368629455566406, "sampling/sampling_logp_difference/mean": 0.0150857949629426, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 215.640625, "completions/mean_terminated_length": 215.640625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3654599189758301, "epoch": 1.8125, "frac_reward_zero_std": 1.0, "grad_norm": 0.017102247445561616, "kl": 0.03623443841934204, "learning_rate": 4.082694140931088e-07, "loss": 0.0004, "num_tokens": 46607343.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6253825426101685, "sampling/importance_sampling_ratio/mean": 0.9996971487998962, "sampling/importance_sampling_ratio/min": 0.4742974638938904, "sampling/sampling_logp_difference/max": 0.7459206581115723, "sampling/sampling_logp_difference/mean": 0.013516171835362911, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 185.03125, "completions/mean_terminated_length": 185.03125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.388782799243927, "epoch": 1.8137254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.8009492885848263, "kl": 0.046929892152547836, "learning_rate": 4.0756921939131563e-07, "loss": -0.0025, "num_tokens": 46636369.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3943331241607666, "sampling/importance_sampling_ratio/mean": 0.9998328685760498, "sampling/importance_sampling_ratio/min": 0.5221091508865356, "sampling/sampling_logp_difference/max": 0.6498786211013794, "sampling/sampling_logp_difference/mean": 0.01529900822788477, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 209.4375, "completions/mean_terminated_length": 209.4375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.505455493927002, "epoch": 1.8149509803921569, "frac_reward_zero_std": 0.5, "grad_norm": 1.2395268664872279, "kl": 0.05534018576145172, "learning_rate": 4.0686921231815155e-07, "loss": 0.01, "num_tokens": 46667837.0, "reward": 0.0625, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6977012157440186, "sampling/importance_sampling_ratio/mean": 0.998990535736084, "sampling/importance_sampling_ratio/min": 0.5189392566680908, "sampling/sampling_logp_difference/max": 0.655968427658081, "sampling/sampling_logp_difference/mean": 0.018538698554039, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 185.359375, "completions/mean_terminated_length": 185.359375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.35948121547698975, "epoch": 1.8161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02318190868571896, "kl": 0.035717546939849854, "learning_rate": 4.0616939429458627e-07, "loss": 0.0004, "num_tokens": 46694036.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6129748821258545, "sampling/importance_sampling_ratio/mean": 0.9997787475585938, "sampling/importance_sampling_ratio/min": 0.5944111943244934, "sampling/sampling_logp_difference/max": 0.5201840400695801, "sampling/sampling_logp_difference/mean": 0.015287473797798157, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.326416552066803, "epoch": 1.8174019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.018925789830675076, "kl": 0.02713128924369812, "learning_rate": 4.0546976674120623e-07, "loss": 0.0003, "num_tokens": 46724636.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5976483821868896, "sampling/importance_sampling_ratio/mean": 1.0004806518554688, "sampling/importance_sampling_ratio/min": 0.7305005192756653, "sampling/sampling_logp_difference/max": 0.4685328006744385, "sampling/sampling_logp_difference/mean": 0.013860877603292465, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 225.0625, "completions/mean_terminated_length": 225.0625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4279215633869171, "epoch": 1.8186274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 1.0377987253461263, "kl": 0.049170322716236115, "learning_rate": 4.047703310782111e-07, "loss": 0.013, "num_tokens": 46762304.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6267346143722534, "sampling/importance_sampling_ratio/mean": 1.0001940727233887, "sampling/importance_sampling_ratio/min": 0.6485505104064941, "sampling/sampling_logp_difference/max": 0.486574649810791, "sampling/sampling_logp_difference/mean": 0.01635306142270565, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.39828744530677795, "epoch": 1.8198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.023462238047806762, "kl": 0.03244244307279587, "learning_rate": 4.0407108872541105e-07, "loss": 0.0003, "num_tokens": 46797776.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6390002965927124, "sampling/importance_sampling_ratio/mean": 1.0000399351119995, "sampling/importance_sampling_ratio/min": 0.6574017405509949, "sampling/sampling_logp_difference/max": 0.49408650398254395, "sampling/sampling_logp_difference/mean": 0.014169460162520409, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.37603577971458435, "epoch": 1.821078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.8349295274843194, "kl": 0.026338636875152588, "learning_rate": 4.0337204110222347e-07, "loss": 0.0111, "num_tokens": 46829304.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6245514154434204, "sampling/importance_sampling_ratio/mean": 0.9991324543952942, "sampling/importance_sampling_ratio/min": 0.6294820308685303, "sampling/sampling_logp_difference/max": 0.4852316379547119, "sampling/sampling_logp_difference/mean": 0.016620833426713943, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 196.140625, "completions/mean_terminated_length": 196.140625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.348731130361557, "epoch": 1.8223039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.05716206847792711, "kl": 0.04719793424010277, "learning_rate": 4.0267318962767076e-07, "loss": 0.0004, "num_tokens": 46859553.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5071709156036377, "sampling/importance_sampling_ratio/mean": 0.9999433755874634, "sampling/importance_sampling_ratio/min": 0.6129412651062012, "sampling/sampling_logp_difference/max": 0.4894862174987793, "sampling/sampling_logp_difference/mean": 0.015518147498369217, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 159.546875, "completions/mean_terminated_length": 159.546875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.306646466255188, "epoch": 1.8235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0361161905300358, "kl": 0.03056943230330944, "learning_rate": 4.0197453572037747e-07, "loss": 0.0003, "num_tokens": 46887892.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4020594358444214, "sampling/importance_sampling_ratio/mean": 1.0001198053359985, "sampling/importance_sampling_ratio/min": 0.621240496635437, "sampling/sampling_logp_difference/max": 0.47603702545166016, "sampling/sampling_logp_difference/mean": 0.012928012758493423, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 225.8125, "completions/mean_terminated_length": 225.8125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.42293429374694824, "epoch": 1.8247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.1095277333689655, "kl": 0.03491745889186859, "learning_rate": 4.0127608079856644e-07, "loss": -0.0007, "num_tokens": 46917544.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4753106832504272, "sampling/importance_sampling_ratio/mean": 1.0002186298370361, "sampling/importance_sampling_ratio/min": 0.47906020283699036, "sampling/sampling_logp_difference/max": 0.735929012298584, "sampling/sampling_logp_difference/mean": 0.01622503250837326, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 212.65625, "completions/mean_terminated_length": 212.65625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.36998146772384644, "epoch": 1.8259803921568627, "frac_reward_zero_std": 1.0, "grad_norm": 0.016325980401073328, "kl": 0.025385094806551933, "learning_rate": 4.005778262800571e-07, "loss": 0.0003, "num_tokens": 46950994.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002012252807617, "sampling/importance_sampling_ratio/min": 0.6067395210266113, "sampling/sampling_logp_difference/max": 0.853581428527832, "sampling/sampling_logp_difference/mean": 0.01505836471915245, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 196.78125, "completions/mean_terminated_length": 196.78125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3660886883735657, "epoch": 1.8272058823529411, "frac_reward_zero_std": 1.0, "grad_norm": 0.016446958710568508, "kl": 0.023385537788271904, "learning_rate": 3.9987977358226175e-07, "loss": 0.0002, "num_tokens": 46984164.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4265543222427368, "sampling/importance_sampling_ratio/mean": 1.0011383295059204, "sampling/importance_sampling_ratio/min": 0.6771497130393982, "sampling/sampling_logp_difference/max": 0.38986289501190186, "sampling/sampling_logp_difference/mean": 0.015456505119800568, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 237.671875, "completions/mean_terminated_length": 237.671875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.530724048614502, "epoch": 1.8284313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.7448468760999643, "kl": 0.04840172827243805, "learning_rate": 3.991819241221835e-07, "loss": 0.0322, "num_tokens": 47031519.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4663141965866089, "sampling/importance_sampling_ratio/mean": 0.9994192719459534, "sampling/importance_sampling_ratio/min": 0.6312842965126038, "sampling/sampling_logp_difference/max": 0.4599989652633667, "sampling/sampling_logp_difference/mean": 0.017024997621774673, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 239.78125, "completions/mean_terminated_length": 239.78125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.47953343391418457, "epoch": 1.829656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.017750015634154247, "kl": 0.03577635809779167, "learning_rate": 3.98484279316412e-07, "loss": 0.0004, "num_tokens": 47068321.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4650589227676392, "sampling/importance_sampling_ratio/mean": 1.0002245903015137, "sampling/importance_sampling_ratio/min": 0.6398684978485107, "sampling/sampling_logp_difference/max": 0.44649267196655273, "sampling/sampling_logp_difference/mean": 0.016227155923843384, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 199.40625, "completions/mean_terminated_length": 199.40625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3214770555496216, "epoch": 1.8308823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.017367003899370518, "kl": 0.02642001025378704, "learning_rate": 3.977868405811223e-07, "loss": 0.0003, "num_tokens": 47096475.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6212598085403442, "sampling/importance_sampling_ratio/mean": 1.0001227855682373, "sampling/importance_sampling_ratio/min": 0.6142379641532898, "sampling/sampling_logp_difference/max": 0.48737287521362305, "sampling/sampling_logp_difference/mean": 0.01426103338599205, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 202.390625, "completions/mean_terminated_length": 202.390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.36925238370895386, "epoch": 1.8321078431372548, "frac_reward_zero_std": 1.0, "grad_norm": 0.01574239035710109, "kl": 0.023774582892656326, "learning_rate": 3.970896093320708e-07, "loss": 0.0002, "num_tokens": 47127012.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.326669692993164, "sampling/importance_sampling_ratio/mean": 0.9998118877410889, "sampling/importance_sampling_ratio/min": 0.5278945565223694, "sampling/sampling_logp_difference/max": 0.6388587951660156, "sampling/sampling_logp_difference/mean": 0.014183331280946732, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 194.53125, "completions/mean_terminated_length": 194.53125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.37928807735443115, "epoch": 1.8333333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.01891790696330377, "kl": 0.02853918820619583, "learning_rate": 3.9639258698459287e-07, "loss": 0.0003, "num_tokens": 47157222.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4152483940124512, "sampling/importance_sampling_ratio/mean": 1.000333309173584, "sampling/importance_sampling_ratio/min": 0.6547790765762329, "sampling/sampling_logp_difference/max": 0.42345738410949707, "sampling/sampling_logp_difference/mean": 0.014429192990064621, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 230.171875, "completions/mean_terminated_length": 230.171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.29474058747291565, "epoch": 1.8345588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.016042095422497977, "kl": 0.020352240651845932, "learning_rate": 3.9569577495359964e-07, "loss": 0.0002, "num_tokens": 47190641.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.9997077584266663, "sampling/importance_sampling_ratio/min": 0.6276949048042297, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.011500919237732887, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 203.578125, "completions/mean_terminated_length": 203.578125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.47230902314186096, "epoch": 1.8357843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.015473087539015013, "kl": 0.028497405350208282, "learning_rate": 3.949991746535753e-07, "loss": 0.0003, "num_tokens": 47220838.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.358568787574768, "sampling/importance_sampling_ratio/mean": 0.9996574521064758, "sampling/importance_sampling_ratio/min": 0.5100560188293457, "sampling/sampling_logp_difference/max": 0.6732347011566162, "sampling/sampling_logp_difference/mean": 0.01717330515384674, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 203.328125, "completions/mean_terminated_length": 203.328125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3301214277744293, "epoch": 1.8370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.017093617533739433, "kl": 0.026626810431480408, "learning_rate": 3.943027874985746e-07, "loss": 0.0003, "num_tokens": 47254683.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5623085498809814, "sampling/importance_sampling_ratio/mean": 1.0003242492675781, "sampling/importance_sampling_ratio/min": 0.6190605163574219, "sampling/sampling_logp_difference/max": 0.4795522689819336, "sampling/sampling_logp_difference/mean": 0.013283345848321915, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 206.140625, "completions/mean_terminated_length": 206.140625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3752307891845703, "epoch": 1.8382352941176472, "frac_reward_zero_std": 1.0, "grad_norm": 0.015116664729821374, "kl": 0.023343995213508606, "learning_rate": 3.9360661490221904e-07, "loss": 0.0002, "num_tokens": 47293956.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6121180057525635, "sampling/importance_sampling_ratio/mean": 0.9999548196792603, "sampling/importance_sampling_ratio/min": 0.6824631690979004, "sampling/sampling_logp_difference/max": 0.47754883766174316, "sampling/sampling_logp_difference/mean": 0.01394678931683302, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 214.578125, "completions/mean_terminated_length": 214.578125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31524908542633057, "epoch": 1.8394607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.7801255082675378, "kl": 0.030825018882751465, "learning_rate": 3.929106582776948e-07, "loss": 0.0721, "num_tokens": 47325033.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.639209508895874, "sampling/importance_sampling_ratio/mean": 0.9999526143074036, "sampling/importance_sampling_ratio/min": 0.6328410506248474, "sampling/sampling_logp_difference/max": 0.4942140579223633, "sampling/sampling_logp_difference/mean": 0.013170123100280762, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 183.8125, "completions/mean_terminated_length": 183.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4793202877044678, "epoch": 1.840686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.8663195262513032, "kl": 0.04631558060646057, "learning_rate": 3.9221491903775013e-07, "loss": -0.031, "num_tokens": 47358893.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.606947898864746, "sampling/importance_sampling_ratio/mean": 1.0002026557922363, "sampling/importance_sampling_ratio/min": 0.5907023549079895, "sampling/sampling_logp_difference/max": 0.5264430046081543, "sampling/sampling_logp_difference/mean": 0.01629520207643509, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4045953154563904, "epoch": 1.8419117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.7631893991916184, "kl": 0.04685451462864876, "learning_rate": 3.9151939859469166e-07, "loss": 0.0155, "num_tokens": 47391165.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.2985237836837769, "sampling/importance_sampling_ratio/mean": 1.0000450611114502, "sampling/importance_sampling_ratio/min": 0.6058517098426819, "sampling/sampling_logp_difference/max": 0.5011200904846191, "sampling/sampling_logp_difference/mean": 0.01413068175315857, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 192.03125, "completions/mean_terminated_length": 192.03125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.34464895725250244, "epoch": 1.843137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.01662504791497207, "kl": 0.025617942214012146, "learning_rate": 3.908240983603813e-07, "loss": 0.0003, "num_tokens": 47422383.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4113905429840088, "sampling/importance_sampling_ratio/mean": 0.9997320175170898, "sampling/importance_sampling_ratio/min": 0.62844318151474, "sampling/sampling_logp_difference/max": 0.4645097255706787, "sampling/sampling_logp_difference/mean": 0.013034269213676453, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 210.8125, "completions/mean_terminated_length": 210.8125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3480450510978699, "epoch": 1.844362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.7958356948308606, "kl": 0.03220169246196747, "learning_rate": 3.9012901974623476e-07, "loss": 0.0099, "num_tokens": 47450707.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3982449769973755, "sampling/importance_sampling_ratio/mean": 0.9996621608734131, "sampling/importance_sampling_ratio/min": 0.6385641098022461, "sampling/sampling_logp_difference/max": 0.448533296585083, "sampling/sampling_logp_difference/mean": 0.013606126420199871, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.453193336725235, "epoch": 1.8455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.9597914881370286, "kl": 0.04644181951880455, "learning_rate": 3.894341641632176e-07, "loss": -0.0137, "num_tokens": 47486703.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5226556062698364, "sampling/importance_sampling_ratio/mean": 1.0002065896987915, "sampling/importance_sampling_ratio/min": 0.6109529137611389, "sampling/sampling_logp_difference/max": 0.4927353858947754, "sampling/sampling_logp_difference/mean": 0.016492381691932678, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 253.53125, "completions/mean_terminated_length": 253.53125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.41846609115600586, "epoch": 1.846813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.019047337485516454, "kl": 0.04174449294805527, "learning_rate": 3.8873953302184283e-07, "loss": 0.0004, "num_tokens": 47522833.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3754518032073975, "sampling/importance_sampling_ratio/mean": 0.999981701374054, "sampling/importance_sampling_ratio/min": 0.6383336186408997, "sampling/sampling_logp_difference/max": 0.4488942623138428, "sampling/sampling_logp_difference/mean": 0.014663152396678925, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 226.265625, "completions/mean_terminated_length": 226.265625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.36275559663772583, "epoch": 1.8480392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0279651867417821, "kl": 0.031749993562698364, "learning_rate": 3.880451277321673e-07, "loss": 0.0003, "num_tokens": 47555106.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.424131155014038, "sampling/importance_sampling_ratio/mean": 0.9998042583465576, "sampling/importance_sampling_ratio/min": 0.56624436378479, "sampling/sampling_logp_difference/max": 0.5687295198440552, "sampling/sampling_logp_difference/mean": 0.014154046773910522, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 155.078125, "completions/mean_terminated_length": 155.078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.31655555963516235, "epoch": 1.8492647058823528, "frac_reward_zero_std": 1.0, "grad_norm": 0.021751454697667986, "kl": 0.026957426220178604, "learning_rate": 3.873509497037899e-07, "loss": 0.0003, "num_tokens": 47584391.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4381095170974731, "sampling/importance_sampling_ratio/mean": 1.0002214908599854, "sampling/importance_sampling_ratio/min": 0.6433576941490173, "sampling/sampling_logp_difference/max": 0.44105446338653564, "sampling/sampling_logp_difference/mean": 0.012428762391209602, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 208.6875, "completions/mean_terminated_length": 208.6875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.32220640778541565, "epoch": 1.8504901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.015277402866472353, "kl": 0.024374518543481827, "learning_rate": 3.8665700034584834e-07, "loss": 0.0002, "num_tokens": 47617091.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.575573205947876, "sampling/importance_sampling_ratio/mean": 0.9997588992118835, "sampling/importance_sampling_ratio/min": 0.5096511840820312, "sampling/sampling_logp_difference/max": 0.674028754234314, "sampling/sampling_logp_difference/mean": 0.01380470022559166, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 165.578125, "completions/mean_terminated_length": 165.578125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.33295172452926636, "epoch": 1.8517156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.05001211305223302, "kl": 0.04372579604387283, "learning_rate": 3.8596328106701533e-07, "loss": 0.0004, "num_tokens": 47641656.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.554552435874939, "sampling/importance_sampling_ratio/mean": 1.0006695985794067, "sampling/importance_sampling_ratio/min": 0.6678668260574341, "sampling/sampling_logp_difference/max": 0.44118762016296387, "sampling/sampling_logp_difference/mean": 0.014502106234431267, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 186.90625, "completions/mean_terminated_length": 186.90625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4239566922187805, "epoch": 1.8529411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.023184292748248538, "kl": 0.03840119391679764, "learning_rate": 3.8526979327549736e-07, "loss": 0.0004, "num_tokens": 47676370.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5825395584106445, "sampling/importance_sampling_ratio/mean": 1.000715732574463, "sampling/importance_sampling_ratio/min": 0.6703637838363647, "sampling/sampling_logp_difference/max": 0.4590308666229248, "sampling/sampling_logp_difference/mean": 0.015674592927098274, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 195.53125, "completions/mean_terminated_length": 195.53125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.35997116565704346, "epoch": 1.8541666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.8710103041200632, "kl": 0.03831769526004791, "learning_rate": 3.845765383790306e-07, "loss": 0.0008, "num_tokens": 47704772.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3383262157440186, "sampling/importance_sampling_ratio/mean": 0.9996328949928284, "sampling/importance_sampling_ratio/min": 0.6200441718101501, "sampling/sampling_logp_difference/max": 0.4779646396636963, "sampling/sampling_logp_difference/mean": 0.015365565195679665, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 216.265625, "completions/mean_terminated_length": 216.265625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4190855622291565, "epoch": 1.8553921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.01935169307066362, "kl": 0.03445492312312126, "learning_rate": 3.8388351778487875e-07, "loss": 0.0004, "num_tokens": 47737845.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4253652095794678, "sampling/importance_sampling_ratio/mean": 0.9997737407684326, "sampling/importance_sampling_ratio/min": 0.7136714458465576, "sampling/sampling_logp_difference/max": 0.3544280529022217, "sampling/sampling_logp_difference/mean": 0.0158874299377203, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 201.796875, "completions/mean_terminated_length": 201.796875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.38299238681793213, "epoch": 1.8566176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.015362492473837216, "kl": 0.02534967102110386, "learning_rate": 3.831907328998295e-07, "loss": 0.0003, "num_tokens": 47770328.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4909775257110596, "sampling/importance_sampling_ratio/mean": 1.0000851154327393, "sampling/importance_sampling_ratio/min": 0.7270282506942749, "sampling/sampling_logp_difference/max": 0.3994319438934326, "sampling/sampling_logp_difference/mean": 0.015265097841620445, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3300290107727051, "epoch": 1.857843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.017287688114681694, "kl": 0.030581336468458176, "learning_rate": 3.824981851301924e-07, "loss": 0.0003, "num_tokens": 47800260.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6193667650222778, "sampling/importance_sampling_ratio/mean": 1.0000019073486328, "sampling/importance_sampling_ratio/min": 0.6434445977210999, "sampling/sampling_logp_difference/max": 0.48203516006469727, "sampling/sampling_logp_difference/mean": 0.013907796703279018, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.39566364884376526, "epoch": 1.8590686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.021384363972712102, "kl": 0.03867260366678238, "learning_rate": 3.818058758817955e-07, "loss": 0.0003, "num_tokens": 47833652.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4483669996261597, "sampling/importance_sampling_ratio/mean": 0.9995390176773071, "sampling/importance_sampling_ratio/min": 0.6335282921791077, "sampling/sampling_logp_difference/max": 0.4564507007598877, "sampling/sampling_logp_difference/mean": 0.015130220912396908, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 215.140625, "completions/mean_terminated_length": 215.140625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.37887823581695557, "epoch": 1.8602941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 1.1560713613113136, "kl": 0.03132549673318863, "learning_rate": 3.81113806559983e-07, "loss": -0.0081, "num_tokens": 47863229.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4703065156936646, "sampling/importance_sampling_ratio/mean": 1.0002840757369995, "sampling/importance_sampling_ratio/min": 0.6794388890266418, "sampling/sampling_logp_difference/max": 0.3864879608154297, "sampling/sampling_logp_difference/mean": 0.015126525424420834, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 161.578125, "completions/mean_terminated_length": 161.578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4315345585346222, "epoch": 1.8615196078431373, "frac_reward_zero_std": 0.5, "grad_norm": 1.42162824891901, "kl": 0.11316701024770737, "learning_rate": 3.804219785696113e-07, "loss": 0.0433, "num_tokens": 47887698.0, "reward": 0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.283816933631897, "sampling/importance_sampling_ratio/mean": 0.999412477016449, "sampling/importance_sampling_ratio/min": 0.608467698097229, "sampling/sampling_logp_difference/max": 0.4968113899230957, "sampling/sampling_logp_difference/mean": 0.016700876876711845, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 164.828125, "completions/mean_terminated_length": 164.828125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.30290788412094116, "epoch": 1.8627450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.018151543106306196, "kl": 0.026134612038731575, "learning_rate": 3.797303933150475e-07, "loss": 0.0002, "num_tokens": 47912551.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5077170133590698, "sampling/importance_sampling_ratio/mean": 1.00059974193573, "sampling/importance_sampling_ratio/min": 0.6030475497245789, "sampling/sampling_logp_difference/max": 0.5057592391967773, "sampling/sampling_logp_difference/mean": 0.013300842605531216, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 235.734375, "completions/mean_terminated_length": 235.734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3396828770637512, "epoch": 1.8639705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.01272104473429735, "kl": 0.02101752534508705, "learning_rate": 3.790390522001662e-07, "loss": 0.0002, "num_tokens": 47949382.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8631864786148071, "sampling/importance_sampling_ratio/mean": 1.00046706199646, "sampling/importance_sampling_ratio/min": 0.6420100927352905, "sampling/sampling_logp_difference/max": 0.6222882270812988, "sampling/sampling_logp_difference/mean": 0.01321179885417223, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 222.359375, "completions/mean_terminated_length": 222.359375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3400956392288208, "epoch": 1.8651960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0161972383848884, "kl": 0.026999952271580696, "learning_rate": 3.7834795662834566e-07, "loss": 0.0002, "num_tokens": 47980733.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.644083023071289, "sampling/importance_sampling_ratio/mean": 1.000594973564148, "sampling/importance_sampling_ratio/min": 0.5639978647232056, "sampling/sampling_logp_difference/max": 0.5727047920227051, "sampling/sampling_logp_difference/mean": 0.013366002589464188, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 286.015625, "completions/mean_terminated_length": 286.015625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.47846367955207825, "epoch": 1.866421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.5874284622063278, "kl": 0.025258827954530716, "learning_rate": 3.776571080024663e-07, "loss": 0.0091, "num_tokens": 48022814.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4995115995407104, "sampling/importance_sampling_ratio/mean": 1.0000669956207275, "sampling/importance_sampling_ratio/min": 0.6933954954147339, "sampling/sampling_logp_difference/max": 0.4051394462585449, "sampling/sampling_logp_difference/mean": 0.015736836940050125, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 230.796875, "completions/mean_terminated_length": 230.796875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3768925666809082, "epoch": 1.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.015557378134246427, "kl": 0.02143799141049385, "learning_rate": 3.76966507724907e-07, "loss": 0.0002, "num_tokens": 48062049.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4400556087493896, "sampling/importance_sampling_ratio/mean": 1.000324010848999, "sampling/importance_sampling_ratio/min": 0.6300141215324402, "sampling/sampling_logp_difference/max": 0.46201300621032715, "sampling/sampling_logp_difference/mean": 0.01351095736026764, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 262.21875, "completions/mean_terminated_length": 262.21875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.47479164600372314, "epoch": 1.8688725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 0.7609147646390458, "kl": 0.03068857453763485, "learning_rate": 3.762761571975429e-07, "loss": -0.0246, "num_tokens": 48101055.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4294888973236084, "sampling/importance_sampling_ratio/mean": 1.0001832246780396, "sampling/importance_sampling_ratio/min": 0.6955002546310425, "sampling/sampling_logp_difference/max": 0.36312389373779297, "sampling/sampling_logp_difference/mean": 0.01597488485276699, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 190.15625, "completions/mean_terminated_length": 190.15625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3384573459625244, "epoch": 1.8700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.022967983325005957, "kl": 0.04190897196531296, "learning_rate": 3.755860578217413e-07, "loss": 0.0005, "num_tokens": 48132617.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6337945461273193, "sampling/importance_sampling_ratio/mean": 0.99971604347229, "sampling/importance_sampling_ratio/min": 0.677150309085846, "sampling/sampling_logp_difference/max": 0.4909052848815918, "sampling/sampling_logp_difference/mean": 0.01371623296290636, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3548075556755066, "epoch": 1.8713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016443605834087695, "kl": 0.026354052126407623, "learning_rate": 3.7489621099836043e-07, "loss": 0.0003, "num_tokens": 48162521.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4063249826431274, "sampling/importance_sampling_ratio/mean": 0.9999675750732422, "sampling/importance_sampling_ratio/min": 0.7368854284286499, "sampling/sampling_logp_difference/max": 0.3409799337387085, "sampling/sampling_logp_difference/mean": 0.013621061109006405, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3396419286727905, "epoch": 1.8725490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.03021546402426761, "kl": 0.035038430243730545, "learning_rate": 3.742066181277457e-07, "loss": 0.0003, "num_tokens": 48197305.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7287288904190063, "sampling/importance_sampling_ratio/mean": 1.000309944152832, "sampling/importance_sampling_ratio/min": 0.6089730858802795, "sampling/sampling_logp_difference/max": 0.5473864078521729, "sampling/sampling_logp_difference/mean": 0.013034914620220661, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 215.859375, "completions/mean_terminated_length": 215.859375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4406580924987793, "epoch": 1.8737745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.9237220374056536, "kl": 0.04749206081032753, "learning_rate": 3.735172806097271e-07, "loss": 0.0224, "num_tokens": 48231536.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.53892982006073, "sampling/importance_sampling_ratio/mean": 0.9993038177490234, "sampling/importance_sampling_ratio/min": 0.6317639350891113, "sampling/sampling_logp_difference/max": 0.45923948287963867, "sampling/sampling_logp_difference/mean": 0.015829021111130714, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 195.328125, "completions/mean_terminated_length": 195.328125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3406910300254822, "epoch": 1.875, "frac_reward_zero_std": 0.75, "grad_norm": 0.6532458550494024, "kl": 0.026171743869781494, "learning_rate": 3.7282819984361577e-07, "loss": 0.0028, "num_tokens": 48261893.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5282044410705566, "sampling/importance_sampling_ratio/mean": 0.9998431205749512, "sampling/importance_sampling_ratio/min": 0.6920354962348938, "sampling/sampling_logp_difference/max": 0.42409348487854004, "sampling/sampling_logp_difference/mean": 0.01412119995802641, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 205.859375, "completions/mean_terminated_length": 205.859375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.36345553398132324, "epoch": 1.8762254901960784, "frac_reward_zero_std": 0.75, "grad_norm": 0.805099297675102, "kl": 0.027597036212682724, "learning_rate": 3.721393772282022e-07, "loss": -0.0021, "num_tokens": 48290988.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4377270936965942, "sampling/importance_sampling_ratio/mean": 1.0010266304016113, "sampling/importance_sampling_ratio/min": 0.6303300857543945, "sampling/sampling_logp_difference/max": 0.46151161193847656, "sampling/sampling_logp_difference/mean": 0.015081214718520641, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 196.34375, "completions/mean_terminated_length": 196.34375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.42034146189689636, "epoch": 1.8774509803921569, "frac_reward_zero_std": 0.75, "grad_norm": 0.9478155692519119, "kl": 0.02742549404501915, "learning_rate": 3.7145081416175264e-07, "loss": 0.0004, "num_tokens": 48321250.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4818246364593506, "sampling/importance_sampling_ratio/mean": 0.9998699426651001, "sampling/importance_sampling_ratio/min": 0.7189988493919373, "sampling/sampling_logp_difference/max": 0.393274188041687, "sampling/sampling_logp_difference/mean": 0.014403371140360832, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.425557017326355, "epoch": 1.8786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.026734386894002888, "kl": 0.053220491856336594, "learning_rate": 3.7076251204200667e-07, "loss": 0.0005, "num_tokens": 48352794.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007447242736816, "sampling/importance_sampling_ratio/mean": 0.9991117715835571, "sampling/importance_sampling_ratio/min": 0.7116121053695679, "sampling/sampling_logp_difference/max": 0.47046899795532227, "sampling/sampling_logp_difference/mean": 0.014125547371804714, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 182.671875, "completions/mean_terminated_length": 182.671875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3695541024208069, "epoch": 1.8799019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.018940723807235122, "kl": 0.03589736297726631, "learning_rate": 3.700744722661736e-07, "loss": 0.0004, "num_tokens": 48378533.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5425060987472534, "sampling/importance_sampling_ratio/mean": 0.9998379945755005, "sampling/importance_sampling_ratio/min": 0.6880703568458557, "sampling/sampling_logp_difference/max": 0.4334084987640381, "sampling/sampling_logp_difference/mean": 0.01465071551501751, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.37948712706565857, "epoch": 1.8811274509803921, "frac_reward_zero_std": 1.0, "grad_norm": 0.018318198725694557, "kl": 0.028661314398050308, "learning_rate": 3.693866962309308e-07, "loss": 0.0003, "num_tokens": 48409437.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5045616626739502, "sampling/importance_sampling_ratio/mean": 0.999610185623169, "sampling/importance_sampling_ratio/min": 0.5768758654594421, "sampling/sampling_logp_difference/max": 0.5501282215118408, "sampling/sampling_logp_difference/mean": 0.015293458476662636, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 185.578125, "completions/mean_terminated_length": 185.578125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.5331075191497803, "epoch": 1.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.05196876691264794, "kl": 0.07588602602481842, "learning_rate": 3.686991853324202e-07, "loss": 0.0008, "num_tokens": 48440530.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5011667013168335, "sampling/importance_sampling_ratio/mean": 0.999535322189331, "sampling/importance_sampling_ratio/min": 0.6088920831680298, "sampling/sampling_logp_difference/max": 0.49611425399780273, "sampling/sampling_logp_difference/mean": 0.018779968842864037, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 217.0625, "completions/mean_terminated_length": 217.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.32210832834243774, "epoch": 1.883578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.017685293873126322, "kl": 0.02645273134112358, "learning_rate": 3.680119409662451e-07, "loss": 0.0002, "num_tokens": 48471222.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.635670781135559, "sampling/importance_sampling_ratio/mean": 1.0004723072052002, "sampling/importance_sampling_ratio/min": 0.6056347489356995, "sampling/sampling_logp_difference/max": 0.5014781951904297, "sampling/sampling_logp_difference/mean": 0.012803269550204277, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 208.4375, "completions/mean_terminated_length": 208.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.39927926659584045, "epoch": 1.8848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.015047651396016628, "kl": 0.02834579348564148, "learning_rate": 3.673249645274682e-07, "loss": 0.0003, "num_tokens": 48502898.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5285691022872925, "sampling/importance_sampling_ratio/mean": 1.0005877017974854, "sampling/importance_sampling_ratio/min": 0.6625506281852722, "sampling/sampling_logp_difference/max": 0.4243321418762207, "sampling/sampling_logp_difference/mean": 0.014671064913272858, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 187.171875, "completions/mean_terminated_length": 187.171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4136110544204712, "epoch": 1.8860294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.9895108502647249, "kl": 0.036319661885499954, "learning_rate": 3.6663825741060805e-07, "loss": 0.0123, "num_tokens": 48534957.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.844038486480713, "sampling/importance_sampling_ratio/mean": 0.9997587203979492, "sampling/importance_sampling_ratio/min": 0.6100226044654846, "sampling/sampling_logp_difference/max": 0.6119580268859863, "sampling/sampling_logp_difference/mean": 0.016377631574869156, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 213.390625, "completions/mean_terminated_length": 213.390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4372606575489044, "epoch": 1.8872549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.9155552613239852, "kl": 0.03465927392244339, "learning_rate": 3.6595182100963686e-07, "loss": -0.0025, "num_tokens": 48564294.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5823155641555786, "sampling/importance_sampling_ratio/mean": 1.0003228187561035, "sampling/importance_sampling_ratio/min": 0.6966019868850708, "sampling/sampling_logp_difference/max": 0.4588892459869385, "sampling/sampling_logp_difference/mean": 0.015279426239430904, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 201.28125, "completions/mean_terminated_length": 201.28125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.44704121351242065, "epoch": 1.8884803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.0935355314855688, "kl": 0.05873614177107811, "learning_rate": 3.652656567179765e-07, "loss": 0.0051, "num_tokens": 48592648.0, "reward": 0.46875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3280521631240845, "sampling/importance_sampling_ratio/mean": 1.0003440380096436, "sampling/importance_sampling_ratio/min": 0.7045369744300842, "sampling/sampling_logp_difference/max": 0.35021448135375977, "sampling/sampling_logp_difference/mean": 0.016065191477537155, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 199.71875, "completions/mean_terminated_length": 199.71875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3954826891422272, "epoch": 1.8897058823529411, "frac_reward_zero_std": 0.75, "grad_norm": 0.6614945317414733, "kl": 0.024616148322820663, "learning_rate": 3.645797659284975e-07, "loss": -0.0268, "num_tokens": 48620118.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.43571937084198, "sampling/importance_sampling_ratio/mean": 1.0004339218139648, "sampling/importance_sampling_ratio/min": 0.6173369288444519, "sampling/sampling_logp_difference/max": 0.48234033584594727, "sampling/sampling_logp_difference/mean": 0.014273724518716335, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 201.890625, "completions/mean_terminated_length": 201.890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4209592938423157, "epoch": 1.8909313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.015838906829838583, "kl": 0.03292512893676758, "learning_rate": 3.638941500335144e-07, "loss": 0.0003, "num_tokens": 48650751.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.448422908782959, "sampling/importance_sampling_ratio/mean": 1.0003647804260254, "sampling/importance_sampling_ratio/min": 0.6615846753120422, "sampling/sampling_logp_difference/max": 0.41311728954315186, "sampling/sampling_logp_difference/mean": 0.014716507866978645, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 236.109375, "completions/mean_terminated_length": 236.109375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4740673005580902, "epoch": 1.892156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.043215480833453, "kl": 0.047959618270397186, "learning_rate": 3.6320881042478433e-07, "loss": -0.0196, "num_tokens": 48681990.0, "reward": 0.34375, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6622728109359741, "sampling/importance_sampling_ratio/mean": 1.0003360509872437, "sampling/importance_sampling_ratio/min": 0.626307487487793, "sampling/sampling_logp_difference/max": 0.508185863494873, "sampling/sampling_logp_difference/mean": 0.016803443431854248, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 160.515625, "completions/mean_terminated_length": 160.515625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.342548668384552, "epoch": 1.8933823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.022317383834636773, "kl": 0.03253442049026489, "learning_rate": 3.6252374849350303e-07, "loss": 0.0003, "num_tokens": 48710743.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9671460390090942, "sampling/importance_sampling_ratio/mean": 0.9998558759689331, "sampling/importance_sampling_ratio/min": 0.5887016654014587, "sampling/sampling_logp_difference/max": 0.6765837669372559, "sampling/sampling_logp_difference/mean": 0.014454076066613197, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 203.40625, "completions/mean_terminated_length": 203.40625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3158635199069977, "epoch": 1.8946078431372548, "frac_reward_zero_std": 0.75, "grad_norm": 0.7180401122204123, "kl": 0.02597656473517418, "learning_rate": 3.618389656303029e-07, "loss": -0.0126, "num_tokens": 48742561.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4308671951293945, "sampling/importance_sampling_ratio/mean": 1.0004360675811768, "sampling/importance_sampling_ratio/min": 0.6622448563575745, "sampling/sampling_logp_difference/max": 0.41211986541748047, "sampling/sampling_logp_difference/mean": 0.01251291949301958, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 213.203125, "completions/mean_terminated_length": 213.203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.4119238257408142, "epoch": 1.8958333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.023738719517323002, "kl": 0.04377554729580879, "learning_rate": 3.6115446322525e-07, "loss": 0.0005, "num_tokens": 48777742.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4875625371932983, "sampling/importance_sampling_ratio/mean": 0.9997202754020691, "sampling/importance_sampling_ratio/min": 0.722936749458313, "sampling/sampling_logp_difference/max": 0.3971388339996338, "sampling/sampling_logp_difference/mean": 0.01472897082567215, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4818275272846222, "epoch": 1.8970588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.7594003267221461, "kl": 0.030104786157608032, "learning_rate": 3.6047024266784035e-07, "loss": 0.006, "num_tokens": 48824262.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5562759637832642, "sampling/importance_sampling_ratio/mean": 1.0002262592315674, "sampling/importance_sampling_ratio/min": 0.607857882976532, "sampling/sampling_logp_difference/max": 0.4978141784667969, "sampling/sampling_logp_difference/mean": 0.015647035092115402, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 232.296875, "completions/mean_terminated_length": 232.296875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.43514683842658997, "epoch": 1.8982843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.023241149598766817, "kl": 0.04113928601145744, "learning_rate": 3.5978630534699865e-07, "loss": 0.0004, "num_tokens": 48859977.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.692874550819397, "sampling/importance_sampling_ratio/mean": 1.0002148151397705, "sampling/importance_sampling_ratio/min": 0.6567551493644714, "sampling/sampling_logp_difference/max": 0.5264279842376709, "sampling/sampling_logp_difference/mean": 0.015478353947401047, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 225.265625, "completions/mean_terminated_length": 225.265625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.38786888122558594, "epoch": 1.8995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7722707835378005, "kl": 0.028615374118089676, "learning_rate": 3.591026526510742e-07, "loss": -0.0096, "num_tokens": 48895546.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2822431325912476, "sampling/importance_sampling_ratio/mean": 0.9998630881309509, "sampling/importance_sampling_ratio/min": 0.5943406224250793, "sampling/sampling_logp_difference/max": 0.5203027725219727, "sampling/sampling_logp_difference/mean": 0.013558058068156242, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.41978776454925537, "epoch": 1.9007352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.9088531463325852, "kl": 0.034309446811676025, "learning_rate": 3.584192859678391e-07, "loss": 0.0111, "num_tokens": 48927754.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3229866027832031, "sampling/importance_sampling_ratio/mean": 0.9999614357948303, "sampling/importance_sampling_ratio/min": 0.7101708650588989, "sampling/sampling_logp_difference/max": 0.3422497510910034, "sampling/sampling_logp_difference/mean": 0.014441236853599548, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 233.421875, "completions/mean_terminated_length": 233.421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.46079522371292114, "epoch": 1.9019607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.7475422481338134, "kl": 0.030957430601119995, "learning_rate": 3.577362066844838e-07, "loss": -0.0197, "num_tokens": 48960805.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.409862756729126, "sampling/importance_sampling_ratio/mean": 0.999723494052887, "sampling/importance_sampling_ratio/min": 0.636890172958374, "sampling/sampling_logp_difference/max": 0.4511580467224121, "sampling/sampling_logp_difference/mean": 0.016117535531520844, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 173.0625, "completions/mean_terminated_length": 173.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4062950611114502, "epoch": 1.903186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0208390156543426, "kl": 0.03014496900141239, "learning_rate": 3.570534161876163e-07, "loss": 0.0003, "num_tokens": 48986633.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3640347719192505, "sampling/importance_sampling_ratio/mean": 1.000274658203125, "sampling/importance_sampling_ratio/min": 0.6374650001525879, "sampling/sampling_logp_difference/max": 0.4502559304237366, "sampling/sampling_logp_difference/mean": 0.015687409788370132, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 246.109375, "completions/mean_terminated_length": 246.109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.387340247631073, "epoch": 1.9044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.015121762657977288, "kl": 0.025227148085832596, "learning_rate": 3.5637091586325796e-07, "loss": 0.0002, "num_tokens": 49025280.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6264692544937134, "sampling/importance_sampling_ratio/mean": 0.9998824596405029, "sampling/importance_sampling_ratio/min": 0.5149549841880798, "sampling/sampling_logp_difference/max": 0.6636757850646973, "sampling/sampling_logp_difference/mean": 0.014368398115038872, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 237.859375, "completions/mean_terminated_length": 237.859375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.39210081100463867, "epoch": 1.905637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.01640127218729267, "kl": 0.02905341237783432, "learning_rate": 3.556887070968414e-07, "loss": 0.0003, "num_tokens": 49058775.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.514710545539856, "sampling/importance_sampling_ratio/mean": 0.9997396469116211, "sampling/importance_sampling_ratio/min": 0.69562166929245, "sampling/sampling_logp_difference/max": 0.4152243137359619, "sampling/sampling_logp_difference/mean": 0.014183616265654564, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 239.984375, "completions/mean_terminated_length": 239.984375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.41489022970199585, "epoch": 1.906862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.018190887281895423, "kl": 0.03981224074959755, "learning_rate": 3.550067912732069e-07, "loss": 0.0004, "num_tokens": 49094774.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.63120436668396, "sampling/importance_sampling_ratio/mean": 0.9999634027481079, "sampling/importance_sampling_ratio/min": 0.6876845955848694, "sampling/sampling_logp_difference/max": 0.4893186092376709, "sampling/sampling_logp_difference/mean": 0.01440738607198, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 253.5625, "completions/mean_terminated_length": 253.5625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.43130192160606384, "epoch": 1.9080882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.7058363093624886, "kl": 0.025736529380083084, "learning_rate": 3.5432516977660054e-07, "loss": 0.0235, "num_tokens": 49128506.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996404647827148, "sampling/importance_sampling_ratio/min": 0.6482198238372803, "sampling/sampling_logp_difference/max": 0.7257037162780762, "sampling/sampling_logp_difference/mean": 0.014149620197713375, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 238.890625, "completions/mean_terminated_length": 238.890625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.361882746219635, "epoch": 1.909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01411134314914194, "kl": 0.023615509271621704, "learning_rate": 3.5364384399067094e-07, "loss": 0.0002, "num_tokens": 49161539.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5045721530914307, "sampling/importance_sampling_ratio/mean": 0.9995989203453064, "sampling/importance_sampling_ratio/min": 0.5740959048271179, "sampling/sampling_logp_difference/max": 0.5549588203430176, "sampling/sampling_logp_difference/mean": 0.013942791149020195, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 240.46875, "completions/mean_terminated_length": 240.46875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5358800292015076, "epoch": 1.9105392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8785223264113101, "kl": 0.0358688049018383, "learning_rate": 3.5296281529846593e-07, "loss": -0.0083, "num_tokens": 49207201.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6312074661254883, "sampling/importance_sampling_ratio/mean": 0.9999327659606934, "sampling/importance_sampling_ratio/min": 0.5513428449630737, "sampling/sampling_logp_difference/max": 0.5953984260559082, "sampling/sampling_logp_difference/mean": 0.01753925159573555, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 219.484375, "completions/mean_terminated_length": 219.484375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37377434968948364, "epoch": 1.9117647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 0.9434145076954223, "kl": 0.03735412657260895, "learning_rate": 3.5228208508243073e-07, "loss": 0.0157, "num_tokens": 49235648.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.667006015777588, "sampling/importance_sampling_ratio/mean": 0.9995557069778442, "sampling/importance_sampling_ratio/min": 0.6204636693000793, "sampling/sampling_logp_difference/max": 0.5110292434692383, "sampling/sampling_logp_difference/mean": 0.01359252817928791, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 239.734375, "completions/mean_terminated_length": 239.734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.32890066504478455, "epoch": 1.9129901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.1616256608714812, "kl": 0.03831387311220169, "learning_rate": 3.5160165472440467e-07, "loss": -0.0073, "num_tokens": 49268959.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3948997259140015, "sampling/importance_sampling_ratio/mean": 0.9996709823608398, "sampling/importance_sampling_ratio/min": 0.6452450156211853, "sampling/sampling_logp_difference/max": 0.4381251335144043, "sampling/sampling_logp_difference/mean": 0.012082546949386597, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.40179872512817383, "epoch": 1.9142156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.02303797620106814, "kl": 0.03159317001700401, "learning_rate": 3.509215256056183e-07, "loss": 0.0003, "num_tokens": 49299719.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3907690048217773, "sampling/importance_sampling_ratio/mean": 1.000664234161377, "sampling/importance_sampling_ratio/min": 0.6130973696708679, "sampling/sampling_logp_difference/max": 0.48923158645629883, "sampling/sampling_logp_difference/mean": 0.014336859807372093, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 238.03125, "completions/mean_terminated_length": 238.03125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3968772292137146, "epoch": 1.9154411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.3002040600877474, "kl": 0.02640102617442608, "learning_rate": 3.502416991066904e-07, "loss": -0.0395, "num_tokens": 49331577.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5946547985076904, "sampling/importance_sampling_ratio/mean": 0.9998785853385925, "sampling/importance_sampling_ratio/min": 0.6462288498878479, "sampling/sampling_logp_difference/max": 0.46665728092193604, "sampling/sampling_logp_difference/mean": 0.013803413137793541, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 243.5, "completions/mean_terminated_length": 243.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3707587718963623, "epoch": 1.9166666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.9164581318319811, "kl": 0.023630518466234207, "learning_rate": 3.495621766076259e-07, "loss": 0.0504, "num_tokens": 49365529.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5934795141220093, "sampling/importance_sampling_ratio/mean": 0.9997700452804565, "sampling/importance_sampling_ratio/min": 0.6277029514312744, "sampling/sampling_logp_difference/max": 0.46591997146606445, "sampling/sampling_logp_difference/mean": 0.014482099562883377, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 205.46875, "completions/mean_terminated_length": 205.46875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.37905019521713257, "epoch": 1.9178921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.037330464795994424, "kl": 0.07470554113388062, "learning_rate": 3.488829594878123e-07, "loss": 0.0006, "num_tokens": 49396087.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.656383752822876, "sampling/importance_sampling_ratio/mean": 0.9995221495628357, "sampling/importance_sampling_ratio/min": 0.6877167820930481, "sampling/sampling_logp_difference/max": 0.5046367645263672, "sampling/sampling_logp_difference/mean": 0.014606459997594357, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 214.5625, "completions/mean_terminated_length": 214.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4915218949317932, "epoch": 1.9191176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.089900933626398, "kl": 0.049614038318395615, "learning_rate": 3.4820404912601757e-07, "loss": 0.0057, "num_tokens": 49431371.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5227725505828857, "sampling/importance_sampling_ratio/mean": 1.0006821155548096, "sampling/importance_sampling_ratio/min": 0.6858685612678528, "sampling/sampling_logp_difference/max": 0.4205327033996582, "sampling/sampling_logp_difference/mean": 0.017097633332014084, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 192.4375, "completions/mean_terminated_length": 192.4375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.34200167655944824, "epoch": 1.920343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.016345475038858755, "kl": 0.029332738369703293, "learning_rate": 3.4752544690038643e-07, "loss": 0.0003, "num_tokens": 49460183.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6541374921798706, "sampling/importance_sampling_ratio/mean": 0.9999823570251465, "sampling/importance_sampling_ratio/min": 0.6625405550003052, "sampling/sampling_logp_difference/max": 0.5032796859741211, "sampling/sampling_logp_difference/mean": 0.013198770582675934, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 224.46875, "completions/mean_terminated_length": 224.46875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.5203051567077637, "epoch": 1.9215686274509802, "frac_reward_zero_std": 0.25, "grad_norm": 1.3166060363514769, "kl": 0.06350134313106537, "learning_rate": 3.468471541884385e-07, "loss": -0.0294, "num_tokens": 49489317.0, "reward": -0.125, "reward_std": 0.7023203372955322, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3266730308532715, "sampling/importance_sampling_ratio/mean": 0.9998144507408142, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.017328787595033646, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 246.65625, "completions/mean_terminated_length": 246.65625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4162110984325409, "epoch": 1.9227941176470589, "frac_reward_zero_std": 0.75, "grad_norm": 0.673909289737263, "kl": 0.03644551709294319, "learning_rate": 3.461691723670651e-07, "loss": 0.0134, "num_tokens": 49521119.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4777597188949585, "sampling/importance_sampling_ratio/mean": 1.0004178285598755, "sampling/importance_sampling_ratio/min": 0.6392900943756104, "sampling/sampling_logp_difference/max": 0.44739699363708496, "sampling/sampling_logp_difference/mean": 0.014774038456380367, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 170.734375, "completions/mean_terminated_length": 170.734375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3501443862915039, "epoch": 1.9240196078431373, "frac_reward_zero_std": 1.0, "grad_norm": 0.020873349081082473, "kl": 0.028426751494407654, "learning_rate": 3.454915028125263e-07, "loss": 0.0003, "num_tokens": 49549694.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4397555589675903, "sampling/importance_sampling_ratio/mean": 0.9996906518936157, "sampling/importance_sampling_ratio/min": 0.6401638984680176, "sampling/sampling_logp_difference/max": 0.4460310935974121, "sampling/sampling_logp_difference/mean": 0.014869427308440208, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 203.515625, "completions/mean_terminated_length": 203.515625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.46519067883491516, "epoch": 1.9252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.019422375818408243, "kl": 0.030188778415322304, "learning_rate": 3.4481414690044836e-07, "loss": 0.0003, "num_tokens": 49583199.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.432383418083191, "sampling/importance_sampling_ratio/mean": 1.0000834465026855, "sampling/importance_sampling_ratio/min": 0.6342064738273621, "sampling/sampling_logp_difference/max": 0.4553806781768799, "sampling/sampling_logp_difference/mean": 0.017122812569141388, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 214.484375, "completions/mean_terminated_length": 214.484375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3726535737514496, "epoch": 1.9264705882352942, "frac_reward_zero_std": 0.5, "grad_norm": 1.123537651624613, "kl": 0.026137355715036392, "learning_rate": 3.441371060058209e-07, "loss": 0.012, "num_tokens": 49615406.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.431039810180664, "sampling/importance_sampling_ratio/mean": 0.9996926784515381, "sampling/importance_sampling_ratio/min": 0.7300207614898682, "sampling/sampling_logp_difference/max": 0.3584012985229492, "sampling/sampling_logp_difference/mean": 0.01359387207776308, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.37289541959762573, "epoch": 1.9276960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.019952024297273645, "kl": 0.030565187335014343, "learning_rate": 3.4346038150299425e-07, "loss": 0.0003, "num_tokens": 49639806.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4249707460403442, "sampling/importance_sampling_ratio/mean": 0.9998721480369568, "sampling/importance_sampling_ratio/min": 0.607313334941864, "sampling/sampling_logp_difference/max": 0.49871039390563965, "sampling/sampling_logp_difference/mean": 0.015014609321951866, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 187.71875, "completions/mean_terminated_length": 187.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.39289116859436035, "epoch": 1.928921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.015109079161216534, "kl": 0.02577681839466095, "learning_rate": 3.427839747656758e-07, "loss": 0.0003, "num_tokens": 49671436.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4461145401000977, "sampling/importance_sampling_ratio/mean": 1.0000542402267456, "sampling/importance_sampling_ratio/min": 0.6255195140838623, "sampling/sampling_logp_difference/max": 0.46917271614074707, "sampling/sampling_logp_difference/mean": 0.014366412535309792, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 178.265625, "completions/mean_terminated_length": 178.265625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.389309823513031, "epoch": 1.9301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9804361746626181, "kl": 0.04585760459303856, "learning_rate": 3.4210788716692875e-07, "loss": -0.005, "num_tokens": 49698685.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3113781213760376, "sampling/importance_sampling_ratio/mean": 1.000306487083435, "sampling/importance_sampling_ratio/min": 0.6482202410697937, "sampling/sampling_logp_difference/max": 0.4335247278213501, "sampling/sampling_logp_difference/mean": 0.013438409194350243, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 198.78125, "completions/mean_terminated_length": 198.78125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.4965009391307831, "epoch": 1.9313725490196079, "frac_reward_zero_std": 0.75, "grad_norm": 1.1148114534308606, "kl": 0.031996533274650574, "learning_rate": 3.414321200791679e-07, "loss": -0.0272, "num_tokens": 49735903.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5280892848968506, "sampling/importance_sampling_ratio/mean": 1.0006356239318848, "sampling/importance_sampling_ratio/min": 0.7082610726356506, "sampling/sampling_logp_difference/max": 0.42401814460754395, "sampling/sampling_logp_difference/mean": 0.017043106257915497, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 182.671875, "completions/mean_terminated_length": 182.671875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.45260685682296753, "epoch": 1.9325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.023600819454342645, "kl": 0.03608972579240799, "learning_rate": 3.4075667487415785e-07, "loss": 0.0004, "num_tokens": 49768426.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6213918924331665, "sampling/importance_sampling_ratio/mean": 1.0005576610565186, "sampling/importance_sampling_ratio/min": 0.6077486276626587, "sampling/sampling_logp_difference/max": 0.49799394607543945, "sampling/sampling_logp_difference/mean": 0.01709265448153019, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 210.421875, "completions/mean_terminated_length": 210.421875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.41075772047042847, "epoch": 1.9338235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7296178305784208, "kl": 0.058899056166410446, "learning_rate": 3.4008155292300934e-07, "loss": -0.0186, "num_tokens": 49796725.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.326687216758728, "sampling/importance_sampling_ratio/mean": 1.000306248664856, "sampling/importance_sampling_ratio/min": 0.6764224171638489, "sampling/sampling_logp_difference/max": 0.39093756675720215, "sampling/sampling_logp_difference/mean": 0.016160249710083008, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 213.765625, "completions/mean_terminated_length": 213.765625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.37981435656547546, "epoch": 1.9350490196078431, "frac_reward_zero_std": 0.75, "grad_norm": 0.7461768884595513, "kl": 0.05328400433063507, "learning_rate": 3.3940675559617723e-07, "loss": 0.0033, "num_tokens": 49832646.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6485239267349243, "sampling/importance_sampling_ratio/mean": 0.9997677803039551, "sampling/importance_sampling_ratio/min": 0.6063785552978516, "sampling/sampling_logp_difference/max": 0.5002508163452148, "sampling/sampling_logp_difference/mean": 0.014991349540650845, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4077078104019165, "epoch": 1.9362745098039216, "frac_reward_zero_std": 1.0, "grad_norm": 0.021226211046230563, "kl": 0.03923317417502403, "learning_rate": 3.3873228426345757e-07, "loss": 0.0004, "num_tokens": 49857310.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3727229833602905, "sampling/importance_sampling_ratio/mean": 0.9997315406799316, "sampling/importance_sampling_ratio/min": 0.6232230067253113, "sampling/sampling_logp_difference/max": 0.4728507995605469, "sampling/sampling_logp_difference/mean": 0.01577834039926529, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 174.546875, "completions/mean_terminated_length": 174.546875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4120787978172302, "epoch": 1.9375, "frac_reward_zero_std": 1.0, "grad_norm": 0.10868653636032384, "kl": 0.09270511567592621, "learning_rate": 3.380581402939841e-07, "loss": 0.0007, "num_tokens": 49882097.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5144963264465332, "sampling/importance_sampling_ratio/mean": 0.9998019933700562, "sampling/importance_sampling_ratio/min": 0.6171509027481079, "sampling/sampling_logp_difference/max": 0.48264169692993164, "sampling/sampling_logp_difference/mean": 0.01620662957429886, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 219.890625, "completions/mean_terminated_length": 219.890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4209383726119995, "epoch": 1.9387254901960784, "frac_reward_zero_std": 0.5, "grad_norm": 1.1926904175844644, "kl": 0.032342009246349335, "learning_rate": 3.373843250562265e-07, "loss": 0.0034, "num_tokens": 49917338.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4694732427597046, "sampling/importance_sampling_ratio/mean": 0.9998754262924194, "sampling/importance_sampling_ratio/min": 0.6038687825202942, "sampling/sampling_logp_difference/max": 0.5043983459472656, "sampling/sampling_logp_difference/mean": 0.015703190118074417, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 186.390625, "completions/mean_terminated_length": 186.390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3987663984298706, "epoch": 1.9399509803921569, "frac_reward_zero_std": 1.0, "grad_norm": 0.023364620329624858, "kl": 0.06759518384933472, "learning_rate": 3.3671083991798697e-07, "loss": 0.0006, "num_tokens": 49944627.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.444628119468689, "sampling/importance_sampling_ratio/mean": 0.9996543526649475, "sampling/importance_sampling_ratio/min": 0.7106347680091858, "sampling/sampling_logp_difference/max": 0.36785197257995605, "sampling/sampling_logp_difference/mean": 0.015497813001275063, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 214.234375, "completions/mean_terminated_length": 214.234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3201386332511902, "epoch": 1.9411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.028574620573220313, "kl": 0.024369578808546066, "learning_rate": 3.360376862463978e-07, "loss": 0.0002, "num_tokens": 49972450.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5025506019592285, "sampling/importance_sampling_ratio/mean": 0.9998648166656494, "sampling/importance_sampling_ratio/min": 0.2960188686847687, "sampling/sampling_logp_difference/max": 1.217332124710083, "sampling/sampling_logp_difference/mean": 0.013632440008223057, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 187.03125, "completions/mean_terminated_length": 187.03125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3416188657283783, "epoch": 1.9424019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0162594117210751, "kl": 0.025658920407295227, "learning_rate": 3.3536486540791823e-07, "loss": 0.0003, "num_tokens": 50000116.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.54694402217865, "sampling/importance_sampling_ratio/mean": 1.0000746250152588, "sampling/importance_sampling_ratio/min": 0.7320797443389893, "sampling/sampling_logp_difference/max": 0.4362814426422119, "sampling/sampling_logp_difference/mean": 0.013598126359283924, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 218.421875, "completions/mean_terminated_length": 218.421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3883388638496399, "epoch": 1.9436274509803921, "frac_reward_zero_std": 0.75, "grad_norm": 0.8629936895714069, "kl": 0.031464897096157074, "learning_rate": 3.3469237876833187e-07, "loss": -0.0288, "num_tokens": 50035775.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.799018383026123, "sampling/importance_sampling_ratio/mean": 0.9999352097511292, "sampling/importance_sampling_ratio/min": 0.6191887855529785, "sampling/sampling_logp_difference/max": 0.5872411727905273, "sampling/sampling_logp_difference/mean": 0.014580667950212955, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.33257895708084106, "epoch": 1.9448529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.021657433703474765, "kl": 0.023327291011810303, "learning_rate": 3.340202276927442e-07, "loss": 0.0002, "num_tokens": 50066983.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.430035948753357, "sampling/importance_sampling_ratio/mean": 1.0003684759140015, "sampling/importance_sampling_ratio/min": 0.6523290276527405, "sampling/sampling_logp_difference/max": 0.42720627784729004, "sampling/sampling_logp_difference/mean": 0.014051834121346474, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 308.71875, "completions/mean_terminated_length": 308.71875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4223977029323578, "epoch": 1.946078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5606014676586772, "kl": 0.05478545278310776, "learning_rate": 3.333484135455792e-07, "loss": -0.0113, "num_tokens": 50106853.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5471713542938232, "sampling/importance_sampling_ratio/mean": 1.0003740787506104, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.014794318936765194, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 200.0625, "completions/mean_terminated_length": 200.0625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.4382716417312622, "epoch": 1.9473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.019181652014097182, "kl": 0.030908845365047455, "learning_rate": 3.326769376905769e-07, "loss": 0.0003, "num_tokens": 50153417.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6531814336776733, "sampling/importance_sampling_ratio/mean": 1.0004072189331055, "sampling/importance_sampling_ratio/min": 0.7394487857818604, "sampling/sampling_logp_difference/max": 0.5027015209197998, "sampling/sampling_logp_difference/mean": 0.016115907579660416, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 290.40625, "completions/mean_terminated_length": 290.40625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3952628970146179, "epoch": 1.9485294117647058, "frac_reward_zero_std": 0.75, "grad_norm": 0.6131906149656822, "kl": 0.036005325615406036, "learning_rate": 3.3200580149079083e-07, "loss": -0.0234, "num_tokens": 50195763.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 0.999909520149231, "sampling/importance_sampling_ratio/min": 0.7331599593162537, "sampling/sampling_logp_difference/max": 0.48689985275268555, "sampling/sampling_logp_difference/mean": 0.01339157484471798, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 210.515625, "completions/mean_terminated_length": 210.515625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4929760694503784, "epoch": 1.9497549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.1510936162821228, "kl": 0.040675677359104156, "learning_rate": 3.31335006308585e-07, "loss": -0.0104, "num_tokens": 50224868.0, "reward": 0.71875, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4317108392715454, "sampling/importance_sampling_ratio/mean": 0.9994465112686157, "sampling/importance_sampling_ratio/min": 0.6771494150161743, "sampling/sampling_logp_difference/max": 0.3898632526397705, "sampling/sampling_logp_difference/mean": 0.016803476959466934, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.43143031001091003, "epoch": 1.9509803921568627, "frac_reward_zero_std": 0.5, "grad_norm": 1.1763649983494817, "kl": 0.06657734513282776, "learning_rate": 3.3066455350563115e-07, "loss": -0.0035, "num_tokens": 50254300.0, "reward": 0.15625, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3829240798950195, "sampling/importance_sampling_ratio/mean": 0.9998020529747009, "sampling/importance_sampling_ratio/min": 0.6788738369941711, "sampling/sampling_logp_difference/max": 0.3873199224472046, "sampling/sampling_logp_difference/mean": 0.015550259500741959, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 221.65625, "completions/mean_terminated_length": 221.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4554600119590759, "epoch": 1.9522058823529411, "frac_reward_zero_std": 0.5, "grad_norm": 1.0336284950443926, "kl": 0.050704628229141235, "learning_rate": 3.29994444442906e-07, "loss": -0.0701, "num_tokens": 50286406.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6007791757583618, "sampling/importance_sampling_ratio/mean": 0.9997819066047668, "sampling/importance_sampling_ratio/min": 0.6097137331962585, "sampling/sampling_logp_difference/max": 0.4947657585144043, "sampling/sampling_logp_difference/mean": 0.016511352732777596, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 263.359375, "completions/mean_terminated_length": 263.359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.513678252696991, "epoch": 1.9534313725490198, "frac_reward_zero_std": 0.25, "grad_norm": 1.2700148200836998, "kl": 0.06838428229093552, "learning_rate": 3.2932468048068836e-07, "loss": 0.0475, "num_tokens": 50322765.0, "reward": 0.4375, "reward_std": 0.6267197132110596, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4775911569595337, "sampling/importance_sampling_ratio/mean": 0.9996185898780823, "sampling/importance_sampling_ratio/min": 0.6207948327064514, "sampling/sampling_logp_difference/max": 0.47675466537475586, "sampling/sampling_logp_difference/mean": 0.01690804958343506, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 248.203125, "completions/mean_terminated_length": 248.203125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2967587113380432, "epoch": 1.954656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.7413381994218197, "kl": 0.03382870927453041, "learning_rate": 3.2865526297855694e-07, "loss": 0.0012, "num_tokens": 50359994.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4068033695220947, "sampling/importance_sampling_ratio/mean": 1.000199317932129, "sampling/importance_sampling_ratio/min": 0.6435267925262451, "sampling/sampling_logp_difference/max": 0.44079160690307617, "sampling/sampling_logp_difference/mean": 0.011207588016986847, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 194.9375, "completions/mean_terminated_length": 194.9375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.41791579127311707, "epoch": 1.9558823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.9452934538935908, "kl": 0.03728911280632019, "learning_rate": 3.2798619329538646e-07, "loss": 0.0215, "num_tokens": 50388630.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3477599620819092, "sampling/importance_sampling_ratio/mean": 1.0002269744873047, "sampling/importance_sampling_ratio/min": 0.6202828288078308, "sampling/sampling_logp_difference/max": 0.47757983207702637, "sampling/sampling_logp_difference/mean": 0.016275346279144287, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 233.484375, "completions/mean_terminated_length": 233.484375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.5423104763031006, "epoch": 1.9571078431372548, "frac_reward_zero_std": 0.25, "grad_norm": 1.3498248195388074, "kl": 0.062157321721315384, "learning_rate": 3.2731747278934623e-07, "loss": -0.0171, "num_tokens": 50423957.0, "reward": 0.375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5467382669448853, "sampling/importance_sampling_ratio/mean": 0.9999659061431885, "sampling/importance_sampling_ratio/min": 0.6392384171485901, "sampling/sampling_logp_difference/max": 0.4474778175354004, "sampling/sampling_logp_difference/mean": 0.01773090660572052, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 247.96875, "completions/mean_terminated_length": 247.96875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.40003079175949097, "epoch": 1.9583333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.689370522366843, "kl": 0.034257661551237106, "learning_rate": 3.266491028178964e-07, "loss": 0.0228, "num_tokens": 50458819.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.549085021018982, "sampling/importance_sampling_ratio/mean": 1.0001552104949951, "sampling/importance_sampling_ratio/min": 0.6638208627700806, "sampling/sampling_logp_difference/max": 0.4376645088195801, "sampling/sampling_logp_difference/mean": 0.01411025132983923, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 218.484375, "completions/mean_terminated_length": 218.484375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3510703444480896, "epoch": 1.9595588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.011985749171011403, "kl": 0.020307883620262146, "learning_rate": 3.2598108473778595e-07, "loss": 0.0002, "num_tokens": 50490194.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4143846035003662, "sampling/importance_sampling_ratio/mean": 1.0001227855682373, "sampling/importance_sampling_ratio/min": 0.6795847415924072, "sampling/sampling_logp_difference/max": 0.3862733840942383, "sampling/sampling_logp_difference/mean": 0.01346974354237318, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4613341987133026, "epoch": 1.9607843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7638189527870977, "kl": 0.05117883160710335, "learning_rate": 3.253134199050489e-07, "loss": 0.0099, "num_tokens": 50523026.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.527489423751831, "sampling/importance_sampling_ratio/mean": 0.9998858571052551, "sampling/importance_sampling_ratio/min": 0.6256482601165771, "sampling/sampling_logp_difference/max": 0.4689669609069824, "sampling/sampling_logp_difference/mean": 0.015857355669140816, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.40186768770217896, "epoch": 1.9620098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.015758176314101594, "kl": 0.02447192184627056, "learning_rate": 3.2464610967500273e-07, "loss": 0.0002, "num_tokens": 50558434.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6482127904891968, "sampling/importance_sampling_ratio/mean": 1.000008225440979, "sampling/importance_sampling_ratio/min": 0.6263538002967834, "sampling/sampling_logp_difference/max": 0.4996914863586426, "sampling/sampling_logp_difference/mean": 0.014021730050444603, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 261.59375, "completions/mean_terminated_length": 261.59375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3546887934207916, "epoch": 1.9632352941176472, "frac_reward_zero_std": 0.75, "grad_norm": 0.7056207683313288, "kl": 0.04179853945970535, "learning_rate": 3.239791554022449e-07, "loss": 0.0169, "num_tokens": 50593960.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.441324234008789, "sampling/importance_sampling_ratio/mean": 1.000085473060608, "sampling/importance_sampling_ratio/min": 0.6648438572883606, "sampling/sampling_logp_difference/max": 0.408203125, "sampling/sampling_logp_difference/mean": 0.012519138865172863, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 227.9375, "completions/mean_terminated_length": 227.9375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3909626603126526, "epoch": 1.9644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.01686007611299536, "kl": 0.02554922364652157, "learning_rate": 3.233125584406505e-07, "loss": 0.0002, "num_tokens": 50627668.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5972938537597656, "sampling/importance_sampling_ratio/mean": 0.9998860359191895, "sampling/importance_sampling_ratio/min": 0.6601378917694092, "sampling/sampling_logp_difference/max": 0.4683108329772949, "sampling/sampling_logp_difference/mean": 0.014788507483899593, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 214.71875, "completions/mean_terminated_length": 214.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.34474942088127136, "epoch": 1.965686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.015010132983517764, "kl": 0.025028621777892113, "learning_rate": 3.226463201433688e-07, "loss": 0.0002, "num_tokens": 50661570.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995816349983215, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.9235885143280029, "sampling/sampling_logp_difference/mean": 0.013952052220702171, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 218.21875, "completions/mean_terminated_length": 218.21875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4414424002170563, "epoch": 1.9669117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.018974998262863217, "kl": 0.03201860934495926, "learning_rate": 3.219804418628216e-07, "loss": 0.0003, "num_tokens": 50695936.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3888485431671143, "sampling/importance_sampling_ratio/mean": 1.0003587007522583, "sampling/importance_sampling_ratio/min": 0.755850613117218, "sampling/sampling_logp_difference/max": 0.3284749984741211, "sampling/sampling_logp_difference/mean": 0.01593206077814102, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 196.203125, "completions/mean_terminated_length": 196.203125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4284861385822296, "epoch": 1.968137254901961, "frac_reward_zero_std": 0.5, "grad_norm": 1.290646620478447, "kl": 0.03602099046111107, "learning_rate": 3.2131492495069965e-07, "loss": -0.0239, "num_tokens": 50729741.0, "reward": -0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.396368145942688, "sampling/importance_sampling_ratio/mean": 1.0005464553833008, "sampling/importance_sampling_ratio/min": 0.7253676652908325, "sampling/sampling_logp_difference/max": 0.3338747024536133, "sampling/sampling_logp_difference/mean": 0.01478176936507225, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 252.390625, "completions/mean_terminated_length": 252.390625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.39664268493652344, "epoch": 1.969362745098039, "frac_reward_zero_std": 0.5, "grad_norm": 1.0307934191508574, "kl": 0.025808095932006836, "learning_rate": 3.206497707579598e-07, "loss": -0.0956, "num_tokens": 50765318.0, "reward": 0.53125, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3490233421325684, "sampling/importance_sampling_ratio/mean": 0.9996511936187744, "sampling/importance_sampling_ratio/min": 0.7528896927833557, "sampling/sampling_logp_difference/max": 0.2993807792663574, "sampling/sampling_logp_difference/mean": 0.013657033443450928, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 180.984375, "completions/mean_terminated_length": 180.984375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.37230825424194336, "epoch": 1.9705882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.017144080067726417, "kl": 0.03063638135790825, "learning_rate": 3.199849806348233e-07, "loss": 0.0003, "num_tokens": 50795701.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5719265937805176, "sampling/importance_sampling_ratio/mean": 0.9995201826095581, "sampling/importance_sampling_ratio/min": 0.645332932472229, "sampling/sampling_logp_difference/max": 0.4523019790649414, "sampling/sampling_logp_difference/mean": 0.014999780803918839, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 168.40625, "completions/mean_terminated_length": 168.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4284522831439972, "epoch": 1.971813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.9406603697428186, "kl": 0.03776994347572327, "learning_rate": 3.1932055593077166e-07, "loss": 0.0168, "num_tokens": 50821407.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4775913953781128, "sampling/importance_sampling_ratio/mean": 1.0000436305999756, "sampling/importance_sampling_ratio/min": 0.6637405157089233, "sampling/sampling_logp_difference/max": 0.4098639488220215, "sampling/sampling_logp_difference/mean": 0.016887390986084938, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 201.15625, "completions/mean_terminated_length": 201.15625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.44030991196632385, "epoch": 1.9730392156862746, "frac_reward_zero_std": 0.5, "grad_norm": 1.346565925620663, "kl": 0.044444017112255096, "learning_rate": 3.186564979945453e-07, "loss": -0.024, "num_tokens": 50853561.0, "reward": 0.5, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7065656185150146, "sampling/importance_sampling_ratio/mean": 1.0007504224777222, "sampling/importance_sampling_ratio/min": 0.6331958770751953, "sampling/sampling_logp_difference/max": 0.5344829559326172, "sampling/sampling_logp_difference/mean": 0.016215015202760696, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 241.515625, "completions/mean_terminated_length": 241.515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3774697482585907, "epoch": 1.9742647058823528, "frac_reward_zero_std": 0.75, "grad_norm": 1.0093232935662737, "kl": 0.0236565750092268, "learning_rate": 3.179928081741394e-07, "loss": -0.0086, "num_tokens": 50895850.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3670462369918823, "sampling/importance_sampling_ratio/mean": 1.000116229057312, "sampling/importance_sampling_ratio/min": 0.604745626449585, "sampling/sampling_logp_difference/max": 0.5029473304748535, "sampling/sampling_logp_difference/mean": 0.013629928231239319, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 220.53125, "completions/mean_terminated_length": 220.53125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.46636515855789185, "epoch": 1.9754901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7741735253856575, "kl": 0.033601656556129456, "learning_rate": 3.173294878168025e-07, "loss": -0.0036, "num_tokens": 50926812.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5776169300079346, "sampling/importance_sampling_ratio/mean": 0.9993590712547302, "sampling/importance_sampling_ratio/min": 0.6298578381538391, "sampling/sampling_logp_difference/max": 0.4622611999511719, "sampling/sampling_logp_difference/mean": 0.016215326264500618, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4220837354660034, "epoch": 1.9767156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.03491036981233961, "kl": 0.03890404850244522, "learning_rate": 3.166665382690327e-07, "loss": 0.0004, "num_tokens": 50958988.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6628754138946533, "sampling/importance_sampling_ratio/mean": 0.9996306896209717, "sampling/importance_sampling_ratio/min": 0.3209123909473419, "sampling/sampling_logp_difference/max": 1.1365870237350464, "sampling/sampling_logp_difference/mean": 0.016190893948078156, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 211.5625, "completions/mean_terminated_length": 211.5625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3358461558818817, "epoch": 1.9779411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.014874952855669052, "kl": 0.023703187704086304, "learning_rate": 3.1600396087657586e-07, "loss": 0.0002, "num_tokens": 50988080.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2980740070343018, "sampling/importance_sampling_ratio/mean": 0.9990770220756531, "sampling/importance_sampling_ratio/min": 0.661005973815918, "sampling/sampling_logp_difference/max": 0.41399240493774414, "sampling/sampling_logp_difference/mean": 0.013611245900392532, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 223.515625, "completions/mean_terminated_length": 223.515625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.39452552795410156, "epoch": 1.9791666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.8423314298277232, "kl": 0.03638045862317085, "learning_rate": 3.153417569844219e-07, "loss": -0.0026, "num_tokens": 51024689.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5418307781219482, "sampling/importance_sampling_ratio/mean": 0.9999921321868896, "sampling/importance_sampling_ratio/min": 0.6482195258140564, "sampling/sampling_logp_difference/max": 0.43352580070495605, "sampling/sampling_logp_difference/mean": 0.0139867402613163, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 184.890625, "completions/mean_terminated_length": 184.890625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5079770088195801, "epoch": 1.9803921568627452, "frac_reward_zero_std": 1.0, "grad_norm": 0.030758759284119606, "kl": 0.058124691247940063, "learning_rate": 3.1467992793680267e-07, "loss": 0.0006, "num_tokens": 51060714.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4924474954605103, "sampling/importance_sampling_ratio/mean": 0.9997177124023438, "sampling/importance_sampling_ratio/min": 0.7062594890594482, "sampling/sampling_logp_difference/max": 0.4004173278808594, "sampling/sampling_logp_difference/mean": 0.017262417823076248, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 166.015625, "completions/mean_terminated_length": 166.015625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.42368459701538086, "epoch": 1.9816176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.9664555440161905, "kl": 0.04469449445605278, "learning_rate": 3.140184750771895e-07, "loss": 0.0118, "num_tokens": 51088299.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.406961441040039, "sampling/importance_sampling_ratio/mean": 0.9997423887252808, "sampling/importance_sampling_ratio/min": 0.6152718663215637, "sampling/sampling_logp_difference/max": 0.4856910705566406, "sampling/sampling_logp_difference/mean": 0.015562538057565689, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 184.421875, "completions/mean_terminated_length": 184.421875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.43028944730758667, "epoch": 1.982843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8922780422666251, "kl": 0.043754082173109055, "learning_rate": 3.133573997482896e-07, "loss": 0.0193, "num_tokens": 51123302.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4377474784851074, "sampling/importance_sampling_ratio/mean": 0.9996957182884216, "sampling/importance_sampling_ratio/min": 0.6646489500999451, "sampling/sampling_logp_difference/max": 0.40849626064300537, "sampling/sampling_logp_difference/mean": 0.015855390578508377, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 190.28125, "completions/mean_terminated_length": 190.28125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.416605681180954, "epoch": 1.9840686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.01691404971353703, "kl": 0.027441244572401047, "learning_rate": 3.1269670329204393e-07, "loss": 0.0003, "num_tokens": 51155560.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6105904579162598, "sampling/importance_sampling_ratio/mean": 1.0004615783691406, "sampling/importance_sampling_ratio/min": 0.6894065141677856, "sampling/sampling_logp_difference/max": 0.47660088539123535, "sampling/sampling_logp_difference/mean": 0.015048052184283733, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3821426033973694, "epoch": 1.9852941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.017578685537538653, "kl": 0.03149223327636719, "learning_rate": 3.1203638704962465e-07, "loss": 0.0003, "num_tokens": 51183060.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4779694080352783, "sampling/importance_sampling_ratio/mean": 0.9998247623443604, "sampling/importance_sampling_ratio/min": 0.662719190120697, "sampling/sampling_logp_difference/max": 0.4114038944244385, "sampling/sampling_logp_difference/mean": 0.014318181201815605, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 216.84375, "completions/mean_terminated_length": 216.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.48681575059890747, "epoch": 1.9865196078431373, "frac_reward_zero_std": 0.75, "grad_norm": 0.5712534163066981, "kl": 0.06585294008255005, "learning_rate": 3.11376452361432e-07, "loss": -0.0004, "num_tokens": 51211962.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6226540803909302, "sampling/importance_sampling_ratio/mean": 0.9997466206550598, "sampling/importance_sampling_ratio/min": 0.7015644907951355, "sampling/sampling_logp_difference/max": 0.48406314849853516, "sampling/sampling_logp_difference/mean": 0.017295796424150467, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 188.828125, "completions/mean_terminated_length": 188.828125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.3920605480670929, "epoch": 1.9877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01813063579027912, "kl": 0.028277505189180374, "learning_rate": 3.107169005670912e-07, "loss": 0.0003, "num_tokens": 51239343.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.439265489578247, "sampling/importance_sampling_ratio/mean": 1.0000557899475098, "sampling/importance_sampling_ratio/min": 0.4821047782897949, "sampling/sampling_logp_difference/max": 0.7295938730239868, "sampling/sampling_logp_difference/mean": 0.014987459406256676, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 164.34375, "completions/mean_terminated_length": 164.34375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.48923131823539734, "epoch": 1.9889705882352942, "frac_reward_zero_std": 0.75, "grad_norm": 1.0667513202962708, "kl": 0.06547729671001434, "learning_rate": 3.100577330054508e-07, "loss": -0.0016, "num_tokens": 51270677.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4297629594802856, "sampling/importance_sampling_ratio/mean": 0.9998574256896973, "sampling/importance_sampling_ratio/min": 0.6807543635368347, "sampling/sampling_logp_difference/max": 0.38455379009246826, "sampling/sampling_logp_difference/mean": 0.016909096390008926, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 195.59375, "completions/mean_terminated_length": 195.59375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.5149567127227783, "epoch": 1.9901960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.3214360322573169, "kl": 0.04750575125217438, "learning_rate": 3.0939895101457914e-07, "loss": -0.0654, "num_tokens": 51298539.0, "reward": 0.28125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.426176905632019, "sampling/importance_sampling_ratio/mean": 1.0000321865081787, "sampling/importance_sampling_ratio/min": 0.6894845962524414, "sampling/sampling_logp_difference/max": 0.3718109130859375, "sampling/sampling_logp_difference/mean": 0.016956638544797897, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.273445725440979, "epoch": 1.991421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.011491568779886541, "kl": 0.02013445645570755, "learning_rate": 3.087405559317622e-07, "loss": 0.0002, "num_tokens": 51333431.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4298361539840698, "sampling/importance_sampling_ratio/mean": 0.9997339844703674, "sampling/importance_sampling_ratio/min": 0.6772682666778564, "sampling/sampling_logp_difference/max": 0.38968777656555176, "sampling/sampling_logp_difference/mean": 0.010294242762029171, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 220.265625, "completions/mean_terminated_length": 220.265625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4093412458896637, "epoch": 1.9926470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.7499495984707397, "kl": 0.027860896661877632, "learning_rate": 3.0808254909349986e-07, "loss": 0.007, "num_tokens": 51364552.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3708269596099854, "sampling/importance_sampling_ratio/mean": 0.9997884631156921, "sampling/importance_sampling_ratio/min": 0.6372929811477661, "sampling/sampling_logp_difference/max": 0.45052576065063477, "sampling/sampling_logp_difference/mean": 0.013881012797355652, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 228.546875, "completions/mean_terminated_length": 228.546875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.43547090888023376, "epoch": 1.9938725490196079, "frac_reward_zero_std": 1.0, "grad_norm": 0.01597495877420292, "kl": 0.030936989933252335, "learning_rate": 3.0742493183550454e-07, "loss": 0.0003, "num_tokens": 51401035.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2765107154846191, "sampling/importance_sampling_ratio/mean": 1.000434398651123, "sampling/importance_sampling_ratio/min": 0.4848077595233917, "sampling/sampling_logp_difference/max": 0.7240028381347656, "sampling/sampling_logp_difference/mean": 0.01582859829068184, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 260.921875, "completions/mean_terminated_length": 260.921875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.5362026691436768, "epoch": 1.9950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.8660097690633888, "kl": 0.061552420258522034, "learning_rate": 3.0676770549269786e-07, "loss": -0.0167, "num_tokens": 51438982.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.423123836517334, "sampling/importance_sampling_ratio/mean": 0.9998515248298645, "sampling/importance_sampling_ratio/min": 0.5836427211761475, "sampling/sampling_logp_difference/max": 0.538466215133667, "sampling/sampling_logp_difference/mean": 0.01750752329826355, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 189.15625, "completions/mean_terminated_length": 189.15625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.44771260023117065, "epoch": 1.9963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.017847464088607986, "kl": 0.030776165425777435, "learning_rate": 3.0611087139920717e-07, "loss": 0.0003, "num_tokens": 51467808.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4113706350326538, "sampling/importance_sampling_ratio/mean": 1.0004737377166748, "sampling/importance_sampling_ratio/min": 0.6768690943717957, "sampling/sampling_logp_difference/max": 0.3902773857116699, "sampling/sampling_logp_difference/mean": 0.01701674610376358, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 186.859375, "completions/mean_terminated_length": 186.859375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.35539400577545166, "epoch": 1.9975490196078431, "frac_reward_zero_std": 1.0, "grad_norm": 0.015409080874464456, "kl": 0.025421861559152603, "learning_rate": 3.054544308883643e-07, "loss": 0.0002, "num_tokens": 51499031.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.459579348564148, "sampling/importance_sampling_ratio/mean": 1.0004053115844727, "sampling/importance_sampling_ratio/min": 0.6269775629043579, "sampling/sampling_logp_difference/max": 0.4668445587158203, "sampling/sampling_logp_difference/mean": 0.012880798429250717, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 207.546875, "completions/mean_terminated_length": 207.546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4121703505516052, "epoch": 1.9987745098039216, "frac_reward_zero_std": 0.75, "grad_norm": 0.8926295743082454, "kl": 0.02864101715385914, "learning_rate": 3.0479838529270186e-07, "loss": 0.0312, "num_tokens": 51529402.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.9235390424728394, "sampling/importance_sampling_ratio/mean": 1.000265121459961, "sampling/importance_sampling_ratio/min": 0.5656784772872925, "sampling/sampling_logp_difference/max": 0.6541666984558105, "sampling/sampling_logp_difference/mean": 0.014927449636161327, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 206.34375, "completions/mean_terminated_length": 206.34375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.4023796021938324, "epoch": 2.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.7346457549800537, "kl": 0.03533341735601425, "learning_rate": 3.0414273594395103e-07, "loss": 0.0126, "num_tokens": 51561392.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4619437456130981, "sampling/importance_sampling_ratio/mean": 1.0000967979431152, "sampling/importance_sampling_ratio/min": 0.779944658279419, "sampling/sampling_logp_difference/max": 0.37976694107055664, "sampling/sampling_logp_difference/mean": 0.014451739378273487, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 226.828125, "completions/mean_terminated_length": 226.828125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4342731237411499, "epoch": 2.0012254901960786, "frac_reward_zero_std": 0.75, "grad_norm": 0.7353459988915481, "kl": 0.03901055082678795, "learning_rate": 3.034874841730382e-07, "loss": -0.0565, "num_tokens": 51597749.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.47286856174469, "sampling/importance_sampling_ratio/mean": 0.9998716711997986, "sampling/importance_sampling_ratio/min": 0.6752111911773682, "sampling/sampling_logp_difference/max": 0.3927297592163086, "sampling/sampling_logp_difference/mean": 0.014761561527848244, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 183.078125, "completions/mean_terminated_length": 183.078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.48680412769317627, "epoch": 2.002450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.1911633868036358, "kl": 0.07100534439086914, "learning_rate": 3.0283263131008307e-07, "loss": -0.0056, "num_tokens": 51627562.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4876328706741333, "sampling/importance_sampling_ratio/mean": 0.9998986124992371, "sampling/importance_sampling_ratio/min": 0.192392960190773, "sampling/sampling_logp_difference/max": 1.6482152938842773, "sampling/sampling_logp_difference/mean": 0.01730552315711975, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 187.890625, "completions/mean_terminated_length": 187.890625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.47430655360221863, "epoch": 2.0036764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.021617528998024542, "kl": 0.040886953473091125, "learning_rate": 3.0217817868439545e-07, "loss": 0.0004, "num_tokens": 51654211.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4650894403457642, "sampling/importance_sampling_ratio/mean": 1.0000306367874146, "sampling/importance_sampling_ratio/min": 0.7136863470077515, "sampling/sampling_logp_difference/max": 0.3819162845611572, "sampling/sampling_logp_difference/mean": 0.01563193090260029, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 183.796875, "completions/mean_terminated_length": 183.796875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.37739595770835876, "epoch": 2.0049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.01592482339486375, "kl": 0.028131265193223953, "learning_rate": 3.015241276244729e-07, "loss": 0.0003, "num_tokens": 51683190.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.44826340675354, "sampling/importance_sampling_ratio/mean": 0.9998368620872498, "sampling/importance_sampling_ratio/min": 0.6941367387771606, "sampling/sampling_logp_difference/max": 0.3703651428222656, "sampling/sampling_logp_difference/mean": 0.01402178406715393, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 213.1875, "completions/mean_terminated_length": 213.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.34953129291534424, "epoch": 2.0061274509803924, "frac_reward_zero_std": 1.0, "grad_norm": 0.01891633922524575, "kl": 0.02821243926882744, "learning_rate": 3.0087047945799724e-07, "loss": 0.0003, "num_tokens": 51711378.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3256299495697021, "sampling/importance_sampling_ratio/mean": 0.9995253682136536, "sampling/importance_sampling_ratio/min": 0.6327881217002869, "sampling/sampling_logp_difference/max": 0.45761966705322266, "sampling/sampling_logp_difference/mean": 0.014304942451417446, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 218.6875, "completions/mean_terminated_length": 218.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.37606731057167053, "epoch": 2.0073529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.2220779175234928, "kl": 0.0383443757891655, "learning_rate": 3.002172355118331e-07, "loss": 0.0208, "num_tokens": 51745150.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6290236711502075, "sampling/importance_sampling_ratio/mean": 0.9998230934143066, "sampling/importance_sampling_ratio/min": 0.6105685234069824, "sampling/sampling_logp_difference/max": 0.4933648109436035, "sampling/sampling_logp_difference/mean": 0.014038465917110443, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 225.203125, "completions/mean_terminated_length": 225.203125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.5388938188552856, "epoch": 2.008578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.0824000288761453, "kl": 0.05878060311079025, "learning_rate": 2.995643971120243e-07, "loss": -0.011, "num_tokens": 51778683.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5637226104736328, "sampling/importance_sampling_ratio/mean": 1.0000779628753662, "sampling/importance_sampling_ratio/min": 0.3701205253601074, "sampling/sampling_logp_difference/max": 0.9939265847206116, "sampling/sampling_logp_difference/mean": 0.01762264594435692, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 176.21875, "completions/mean_terminated_length": 176.21875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.40344706177711487, "epoch": 2.0098039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.9937570432346224, "kl": 0.04136580601334572, "learning_rate": 2.9891196558379126e-07, "loss": 0.0262, "num_tokens": 51807257.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4272184371948242, "sampling/importance_sampling_ratio/mean": 1.000483751296997, "sampling/importance_sampling_ratio/min": 0.6946358680725098, "sampling/sampling_logp_difference/max": 0.3643674850463867, "sampling/sampling_logp_difference/mean": 0.01589183695614338, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 191.734375, "completions/mean_terminated_length": 191.734375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3914809823036194, "epoch": 2.011029411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.018623118788042722, "kl": 0.030127104371786118, "learning_rate": 2.9825994225152884e-07, "loss": 0.0003, "num_tokens": 51835816.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.651085376739502, "sampling/importance_sampling_ratio/mean": 0.9999833703041077, "sampling/importance_sampling_ratio/min": 0.617863655090332, "sampling/sampling_logp_difference/max": 0.5014328956604004, "sampling/sampling_logp_difference/mean": 0.014743650332093239, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 224.09375, "completions/mean_terminated_length": 224.09375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4045575261116028, "epoch": 2.0122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01935921996649146, "kl": 0.04010814055800438, "learning_rate": 2.976083284388031e-07, "loss": 0.0004, "num_tokens": 51868526.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7385188341140747, "sampling/importance_sampling_ratio/mean": 0.9998279213905334, "sampling/importance_sampling_ratio/min": 0.6948853731155396, "sampling/sampling_logp_difference/max": 0.5530334711074829, "sampling/sampling_logp_difference/mean": 0.015155438333749771, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 268.265625, "completions/mean_terminated_length": 268.265625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4279055595397949, "epoch": 2.013480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7053391349246472, "kl": 0.039207860827445984, "learning_rate": 2.9695712546834885e-07, "loss": -0.0033, "num_tokens": 51913343.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.371711254119873, "sampling/importance_sampling_ratio/mean": 0.9999764561653137, "sampling/importance_sampling_ratio/min": 0.6298378109931946, "sampling/sampling_logp_difference/max": 0.4622929096221924, "sampling/sampling_logp_difference/mean": 0.01395304873585701, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 224.65625, "completions/mean_terminated_length": 224.65625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3981042206287384, "epoch": 2.014705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01815549718527519, "kl": 0.029402870684862137, "learning_rate": 2.9630633466206655e-07, "loss": 0.0003, "num_tokens": 51948761.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3368773460388184, "sampling/importance_sampling_ratio/mean": 1.0000650882720947, "sampling/importance_sampling_ratio/min": 0.6909277439117432, "sampling/sampling_logp_difference/max": 0.3697199821472168, "sampling/sampling_logp_difference/mean": 0.01399196032434702, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 190.328125, "completions/mean_terminated_length": 190.328125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.44557228684425354, "epoch": 2.0159313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.9642542086273165, "kl": 0.032532282173633575, "learning_rate": 2.9565595734102043e-07, "loss": -0.0098, "num_tokens": 51978094.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.607919692993164, "sampling/importance_sampling_ratio/mean": 1.0002827644348145, "sampling/importance_sampling_ratio/min": 0.6513573527336121, "sampling/sampling_logp_difference/max": 0.4749412536621094, "sampling/sampling_logp_difference/mean": 0.015849046409130096, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 184.046875, "completions/mean_terminated_length": 184.046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3923611044883728, "epoch": 2.017156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 1.1044367314621921, "kl": 0.03323255106806755, "learning_rate": 2.950059948254355e-07, "loss": -0.0253, "num_tokens": 52007409.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4352164268493652, "sampling/importance_sampling_ratio/mean": 1.0003793239593506, "sampling/importance_sampling_ratio/min": 0.6964589953422546, "sampling/sampling_logp_difference/max": 0.36174631118774414, "sampling/sampling_logp_difference/mean": 0.014819534495472908, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 243.40625, "completions/mean_terminated_length": 243.40625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.37322890758514404, "epoch": 2.0183823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.016663914336851788, "kl": 0.025287559255957603, "learning_rate": 2.943564484346943e-07, "loss": 0.0002, "num_tokens": 52043099.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003478527069092, "sampling/importance_sampling_ratio/min": 0.11109285801649094, "sampling/sampling_logp_difference/max": 2.1973888874053955, "sampling/sampling_logp_difference/mean": 0.014352137222886086, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 200.421875, "completions/mean_terminated_length": 200.421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.44276976585388184, "epoch": 2.019607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.9201914178971083, "kl": 0.036136072129011154, "learning_rate": 2.937073194873348e-07, "loss": 0.0091, "num_tokens": 52074758.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5517765283584595, "sampling/importance_sampling_ratio/mean": 1.0003318786621094, "sampling/importance_sampling_ratio/min": 0.6313734650611877, "sampling/sampling_logp_difference/max": 0.459857702255249, "sampling/sampling_logp_difference/mean": 0.016168639063835144, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 265.453125, "completions/mean_terminated_length": 265.453125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.55025714635849, "epoch": 2.0208333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 0.9591424855913173, "kl": 0.056286685168743134, "learning_rate": 2.930586093010477e-07, "loss": 0.0432, "num_tokens": 52107523.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.531273603439331, "sampling/importance_sampling_ratio/mean": 0.9996803998947144, "sampling/importance_sampling_ratio/min": 0.6929144859313965, "sampling/sampling_logp_difference/max": 0.4260997772216797, "sampling/sampling_logp_difference/mean": 0.016547497361898422, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 193.15625, "completions/mean_terminated_length": 193.15625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.43626347184181213, "epoch": 2.0220588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.05836585662966822, "kl": 0.05067119002342224, "learning_rate": 2.9241031919267363e-07, "loss": 0.0005, "num_tokens": 52133965.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4176301956176758, "sampling/importance_sampling_ratio/mean": 0.9999895095825195, "sampling/importance_sampling_ratio/min": 0.2822890281677246, "sampling/sampling_logp_difference/max": 1.2648237943649292, "sampling/sampling_logp_difference/mean": 0.01583799347281456, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 275.4375, "completions/mean_terminated_length": 275.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.39282792806625366, "epoch": 2.0232843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.6110182406368218, "kl": 0.02741451933979988, "learning_rate": 2.917624504782006e-07, "loss": 0.0202, "num_tokens": 52176137.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998013377189636, "sampling/importance_sampling_ratio/min": 0.6771509051322937, "sampling/sampling_logp_difference/max": 1.1542718410491943, "sampling/sampling_logp_difference/mean": 0.013893187046051025, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 200.640625, "completions/mean_terminated_length": 200.640625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.390971302986145, "epoch": 2.0245098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.029704465085714463, "kl": 0.03557724878191948, "learning_rate": 2.911150044727605e-07, "loss": 0.0003, "num_tokens": 52212066.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5323922634124756, "sampling/importance_sampling_ratio/mean": 1.0003458261489868, "sampling/importance_sampling_ratio/min": 0.6090294122695923, "sampling/sampling_logp_difference/max": 0.49588871002197266, "sampling/sampling_logp_difference/mean": 0.015440979041159153, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 184.359375, "completions/mean_terminated_length": 184.359375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.44655194878578186, "epoch": 2.025735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.021481327382425034, "kl": 0.03439909219741821, "learning_rate": 2.9046798249062824e-07, "loss": 0.0003, "num_tokens": 52244777.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6057103872299194, "sampling/importance_sampling_ratio/mean": 1.0004479885101318, "sampling/importance_sampling_ratio/min": 0.7303872108459473, "sampling/sampling_logp_difference/max": 0.47356629371643066, "sampling/sampling_logp_difference/mean": 0.01628941111266613, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 260.484375, "completions/mean_terminated_length": 260.484375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.43018054962158203, "epoch": 2.0269607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6781226854264821, "kl": 0.03034813329577446, "learning_rate": 2.898213858452173e-07, "loss": -0.0181, "num_tokens": 52280520.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4397597312927246, "sampling/importance_sampling_ratio/mean": 0.9999618530273438, "sampling/importance_sampling_ratio/min": 0.6438822746276855, "sampling/sampling_logp_difference/max": 0.44023942947387695, "sampling/sampling_logp_difference/mean": 0.014018382877111435, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 185.390625, "completions/mean_terminated_length": 185.390625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3812934458255768, "epoch": 2.028186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.020154181701656497, "kl": 0.034681811928749084, "learning_rate": 2.891752158490778e-07, "loss": 0.0003, "num_tokens": 52309889.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6463236808776855, "sampling/importance_sampling_ratio/mean": 0.9999797344207764, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.49854469299316406, "sampling/sampling_logp_difference/mean": 0.013666651211678982, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 296.59375, "completions/mean_terminated_length": 296.59375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4506591260433197, "epoch": 2.0294117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 0.9232126386733178, "kl": 0.04330100864171982, "learning_rate": 2.8852947381389405e-07, "loss": 0.0557, "num_tokens": 52348679.0, "reward": 0.6875, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.952916145324707, "sampling/importance_sampling_ratio/mean": 1.0001850128173828, "sampling/importance_sampling_ratio/min": 0.6680294275283813, "sampling/sampling_logp_difference/max": 0.6693236827850342, "sampling/sampling_logp_difference/mean": 0.013749771751463413, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 242.203125, "completions/mean_terminated_length": 242.203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.40169528126716614, "epoch": 2.030637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.43990065422475777, "kl": 0.033652957528829575, "learning_rate": 2.8788416105048117e-07, "loss": -0.0256, "num_tokens": 52386756.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3932278156280518, "sampling/importance_sampling_ratio/mean": 0.999697744846344, "sampling/importance_sampling_ratio/min": 0.6413841843605042, "sampling/sampling_logp_difference/max": 0.44412660598754883, "sampling/sampling_logp_difference/mean": 0.013752281665802002, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 289.3125, "completions/mean_terminated_length": 289.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.40133070945739746, "epoch": 2.031862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.017636936556527004, "kl": 0.02483372949063778, "learning_rate": 2.8723927886878396e-07, "loss": 0.0002, "num_tokens": 52424344.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3494555950164795, "sampling/importance_sampling_ratio/mean": 1.0002000331878662, "sampling/importance_sampling_ratio/min": 0.6392406225204468, "sampling/sampling_logp_difference/max": 0.4474743604660034, "sampling/sampling_logp_difference/mean": 0.014143183827400208, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 162.421875, "completions/mean_terminated_length": 162.421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.32380715012550354, "epoch": 2.0330882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.022417009948019623, "kl": 0.03482438996434212, "learning_rate": 2.865948285778713e-07, "loss": 0.0003, "num_tokens": 52446899.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3108482360839844, "sampling/importance_sampling_ratio/mean": 0.9990267157554626, "sampling/importance_sampling_ratio/min": 0.6505879759788513, "sampling/sampling_logp_difference/max": 0.42987871170043945, "sampling/sampling_logp_difference/mean": 0.013643001206219196, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 208.890625, "completions/mean_terminated_length": 208.890625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3969336152076721, "epoch": 2.034313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.017978950074423048, "kl": 0.028416968882083893, "learning_rate": 2.8595081148593737e-07, "loss": 0.0003, "num_tokens": 52478204.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3293585777282715, "sampling/importance_sampling_ratio/mean": 1.0005228519439697, "sampling/importance_sampling_ratio/min": 0.7291538715362549, "sampling/sampling_logp_difference/max": 0.3158705234527588, "sampling/sampling_logp_difference/mean": 0.013309773057699203, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 183.515625, "completions/mean_terminated_length": 183.515625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.4349133372306824, "epoch": 2.0355392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8251669417316957, "kl": 0.061835259199142456, "learning_rate": 2.8530722890029534e-07, "loss": 0.0128, "num_tokens": 52506173.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.423336148262024, "sampling/importance_sampling_ratio/mean": 0.9992690086364746, "sampling/importance_sampling_ratio/min": 0.6807805895805359, "sampling/sampling_logp_difference/max": 0.38451528549194336, "sampling/sampling_logp_difference/mean": 0.015650738030672073, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 212.671875, "completions/mean_terminated_length": 212.671875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.4570426046848297, "epoch": 2.036764705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.1227828641953626, "kl": 0.074189193546772, "learning_rate": 2.8466408212737776e-07, "loss": 0.0187, "num_tokens": 52535128.0, "reward": 0.0625, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.629642128944397, "sampling/importance_sampling_ratio/mean": 0.9999319314956665, "sampling/importance_sampling_ratio/min": 0.44578036665916443, "sampling/sampling_logp_difference/max": 0.8079289197921753, "sampling/sampling_logp_difference/mean": 0.01558186300098896, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4032864570617676, "epoch": 2.0379901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8918525831586327, "kl": 0.03512345626950264, "learning_rate": 2.840213724727315e-07, "loss": 0.0653, "num_tokens": 52567776.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0001716613769531, "sampling/importance_sampling_ratio/min": 0.6171543002128601, "sampling/sampling_logp_difference/max": 0.4826362133026123, "sampling/sampling_logp_difference/mean": 0.014290999621152878, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 229.078125, "completions/mean_terminated_length": 229.078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.39683741331100464, "epoch": 2.0392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.018637937810432412, "kl": 0.02914087474346161, "learning_rate": 2.8337910124101625e-07, "loss": 0.0003, "num_tokens": 52598501.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.484952688217163, "sampling/importance_sampling_ratio/mean": 1.0001593828201294, "sampling/importance_sampling_ratio/min": 0.6945454478263855, "sampling/sampling_logp_difference/max": 0.3953828811645508, "sampling/sampling_logp_difference/mean": 0.01495380885899067, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 241.359375, "completions/mean_terminated_length": 241.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5107121467590332, "epoch": 2.0404411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.7397664210671164, "kl": 0.1048065572977066, "learning_rate": 2.8273726973600254e-07, "loss": -0.0008, "num_tokens": 52634892.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4599852561950684, "sampling/importance_sampling_ratio/mean": 1.0000249147415161, "sampling/importance_sampling_ratio/min": 0.6448967456817627, "sampling/sampling_logp_difference/max": 0.4386650323867798, "sampling/sampling_logp_difference/mean": 0.017005791887640953, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 214.640625, "completions/mean_terminated_length": 214.640625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.44128990173339844, "epoch": 2.0416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.019255397438440618, "kl": 0.029830558225512505, "learning_rate": 2.8209587926056687e-07, "loss": 0.0003, "num_tokens": 52669157.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5841492414474487, "sampling/importance_sampling_ratio/mean": 1.0002622604370117, "sampling/importance_sampling_ratio/min": 0.7802820205688477, "sampling/sampling_logp_difference/max": 0.46004748344421387, "sampling/sampling_logp_difference/mean": 0.015293768607079983, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 212.28125, "completions/mean_terminated_length": 212.28125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4199128746986389, "epoch": 2.042892156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7731108336925153, "kl": 0.0362180694937706, "learning_rate": 2.8145493111669183e-07, "loss": -0.0057, "num_tokens": 52698519.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.8718528747558594, "sampling/importance_sampling_ratio/mean": 0.9999653100967407, "sampling/importance_sampling_ratio/min": 0.614104151725769, "sampling/sampling_logp_difference/max": 0.6269288063049316, "sampling/sampling_logp_difference/mean": 0.015863358974456787, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 245.078125, "completions/mean_terminated_length": 245.078125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3867439031600952, "epoch": 2.0441176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.01829240269316071, "kl": 0.036024175584316254, "learning_rate": 2.808144266054612e-07, "loss": 0.0004, "num_tokens": 52734428.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4562289714813232, "sampling/importance_sampling_ratio/mean": 1.0001846551895142, "sampling/importance_sampling_ratio/min": 0.6771669387817383, "sampling/sampling_logp_difference/max": 0.389837384223938, "sampling/sampling_logp_difference/mean": 0.012600721791386604, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 212.84375, "completions/mean_terminated_length": 212.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.42237645387649536, "epoch": 2.045343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 1.0194288976480008, "kl": 0.060419388115406036, "learning_rate": 2.80174367027059e-07, "loss": 0.0089, "num_tokens": 52762674.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3720364570617676, "sampling/importance_sampling_ratio/mean": 1.0000308752059937, "sampling/importance_sampling_ratio/min": 0.7257882952690125, "sampling/sampling_logp_difference/max": 0.32049691677093506, "sampling/sampling_logp_difference/mean": 0.014276196248829365, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 211.734375, "completions/mean_terminated_length": 211.734375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.44351261854171753, "epoch": 2.0465686274509802, "frac_reward_zero_std": 0.5, "grad_norm": 1.0851909857559852, "kl": 0.059834472835063934, "learning_rate": 2.795347536807653e-07, "loss": -0.0173, "num_tokens": 52790353.0, "reward": 0.71875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997990131378174, "sampling/importance_sampling_ratio/min": 0.6218530535697937, "sampling/sampling_logp_difference/max": 0.6934218406677246, "sampling/sampling_logp_difference/mean": 0.015442324802279472, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 235.34375, "completions/mean_terminated_length": 235.34375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.491445392370224, "epoch": 2.047794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.912618613009257, "kl": 0.029086031019687653, "learning_rate": 2.7889558786495455e-07, "loss": 0.063, "num_tokens": 52821719.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3871740102767944, "sampling/importance_sampling_ratio/mean": 0.9998846054077148, "sampling/importance_sampling_ratio/min": 0.6204485893249512, "sampling/sampling_logp_difference/max": 0.4773125648498535, "sampling/sampling_logp_difference/mean": 0.01591584086418152, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.31475579738616943, "epoch": 2.049019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.05779936415975806, "kl": 0.03842802345752716, "learning_rate": 2.782568708770933e-07, "loss": 0.0004, "num_tokens": 52853599.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.294599175453186, "sampling/importance_sampling_ratio/mean": 1.0002093315124512, "sampling/importance_sampling_ratio/min": 0.6579135060310364, "sampling/sampling_logp_difference/max": 0.4186818599700928, "sampling/sampling_logp_difference/mean": 0.012618561275303364, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3651142716407776, "epoch": 2.0502450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.021175015253094443, "kl": 0.03167567402124405, "learning_rate": 2.7761860401373627e-07, "loss": 0.0003, "num_tokens": 52885103.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6355892419815063, "sampling/importance_sampling_ratio/mean": 0.9996398687362671, "sampling/importance_sampling_ratio/min": 0.6303655505180359, "sampling/sampling_logp_difference/max": 0.4920032024383545, "sampling/sampling_logp_difference/mean": 0.013990361243486404, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 240.796875, "completions/mean_terminated_length": 240.796875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37054872512817383, "epoch": 2.051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6507139850082363, "kl": 0.02488752454519272, "learning_rate": 2.7698078857052474e-07, "loss": -0.0175, "num_tokens": 52914850.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6448520421981812, "sampling/importance_sampling_ratio/mean": 0.9998887181282043, "sampling/importance_sampling_ratio/min": 0.5966106653213501, "sampling/sampling_logp_difference/max": 0.5164905786514282, "sampling/sampling_logp_difference/mean": 0.013555832207202911, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 243.90625, "completions/mean_terminated_length": 243.90625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.40336084365844727, "epoch": 2.0526960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.7497345971673042, "kl": 0.039283160120248795, "learning_rate": 2.763434258421836e-07, "loss": -0.0003, "num_tokens": 52948236.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.546738624572754, "sampling/importance_sampling_ratio/mean": 1.0000205039978027, "sampling/importance_sampling_ratio/min": 0.606302797794342, "sampling/sampling_logp_difference/max": 0.5003757476806641, "sampling/sampling_logp_difference/mean": 0.01379475649446249, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3703897297382355, "epoch": 2.053921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.016045510247310706, "kl": 0.025694692507386208, "learning_rate": 2.757065171225192e-07, "loss": 0.0002, "num_tokens": 52977532.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4028879404067993, "sampling/importance_sampling_ratio/mean": 1.000049114227295, "sampling/importance_sampling_ratio/min": 0.6918835639953613, "sampling/sampling_logp_difference/max": 0.36833763122558594, "sampling/sampling_logp_difference/mean": 0.01374561432749033, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 207.421875, "completions/mean_terminated_length": 207.421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.442902147769928, "epoch": 2.0551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6825921709536202, "kl": 0.05967596918344498, "learning_rate": 2.750700637044155e-07, "loss": 0.0041, "num_tokens": 53007111.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4362882375717163, "sampling/importance_sampling_ratio/mean": 1.0001070499420166, "sampling/importance_sampling_ratio/min": 0.7374628186225891, "sampling/sampling_logp_difference/max": 0.36206209659576416, "sampling/sampling_logp_difference/mean": 0.013813722878694534, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 276.640625, "completions/mean_terminated_length": 276.640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.46242693066596985, "epoch": 2.0563725490196076, "frac_reward_zero_std": 0.75, "grad_norm": 0.819021851634093, "kl": 0.04801500216126442, "learning_rate": 2.7443406687983264e-07, "loss": 0.0126, "num_tokens": 53045504.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 0.9997797608375549, "sampling/importance_sampling_ratio/min": 0.6948862671852112, "sampling/sampling_logp_difference/max": 0.49199533462524414, "sampling/sampling_logp_difference/mean": 0.014683485962450504, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 219.515625, "completions/mean_terminated_length": 219.515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4543115794658661, "epoch": 2.0575980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.027611047739936837, "kl": 0.048169635236263275, "learning_rate": 2.7379852793980416e-07, "loss": 0.0005, "num_tokens": 53076625.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4253244400024414, "sampling/importance_sampling_ratio/mean": 1.000441074371338, "sampling/importance_sampling_ratio/min": 0.6624775528907776, "sampling/sampling_logp_difference/max": 0.41176867485046387, "sampling/sampling_logp_difference/mean": 0.014917616732418537, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 278.03125, "completions/mean_terminated_length": 278.03125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.5146213173866272, "epoch": 2.0588235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 0.8918473278469441, "kl": 0.042962804436683655, "learning_rate": 2.7316344817443363e-07, "loss": -0.0202, "num_tokens": 53113059.0, "reward": 0.125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.478275179862976, "sampling/importance_sampling_ratio/mean": 1.000291109085083, "sampling/importance_sampling_ratio/min": 0.6280068159103394, "sampling/sampling_logp_difference/max": 0.46520423889160156, "sampling/sampling_logp_difference/mean": 0.015553226694464684, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.42054322361946106, "epoch": 2.060049019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.000222056642928, "kl": 0.07231605052947998, "learning_rate": 2.7252882887289287e-07, "loss": -0.0323, "num_tokens": 53143299.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4710259437561035, "sampling/importance_sampling_ratio/mean": 0.999838650226593, "sampling/importance_sampling_ratio/min": 0.6984808444976807, "sampling/sampling_logp_difference/max": 0.38596010208129883, "sampling/sampling_logp_difference/mean": 0.013918423093855381, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.40502676367759705, "epoch": 2.0612745098039214, "frac_reward_zero_std": 1.0, "grad_norm": 0.02565488586677233, "kl": 0.03213399276137352, "learning_rate": 2.718946713234185e-07, "loss": 0.0003, "num_tokens": 53169831.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.590608835220337, "sampling/importance_sampling_ratio/mean": 1.000605583190918, "sampling/importance_sampling_ratio/min": 0.36815980076789856, "sampling/sampling_logp_difference/max": 0.9992382526397705, "sampling/sampling_logp_difference/mean": 0.015341555699706078, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 238.796875, "completions/mean_terminated_length": 238.796875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.47911858558654785, "epoch": 2.0625, "frac_reward_zero_std": 0.5, "grad_norm": 1.0922856363247173, "kl": 0.0631026178598404, "learning_rate": 2.712609768133106e-07, "loss": -0.0219, "num_tokens": 53209146.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.393763780593872, "sampling/importance_sampling_ratio/mean": 1.000108003616333, "sampling/importance_sampling_ratio/min": 0.7464548945426941, "sampling/sampling_logp_difference/max": 0.33200788497924805, "sampling/sampling_logp_difference/mean": 0.015322668477892876, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4741671085357666, "epoch": 2.063725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.7921750118960504, "kl": 0.027647310867905617, "learning_rate": 2.7062774662892886e-07, "loss": -0.0144, "num_tokens": 53245818.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5185213088989258, "sampling/importance_sampling_ratio/mean": 1.0000309944152832, "sampling/importance_sampling_ratio/min": 0.7082099914550781, "sampling/sampling_logp_difference/max": 0.4177370071411133, "sampling/sampling_logp_difference/mean": 0.015398973599076271, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 252.234375, "completions/mean_terminated_length": 252.234375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4754703640937805, "epoch": 2.064950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.5951646185760013, "kl": 0.050365347415208817, "learning_rate": 2.6999498205569e-07, "loss": 0.0316, "num_tokens": 53279993.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.433864951133728, "sampling/importance_sampling_ratio/mean": 1.0001448392868042, "sampling/importance_sampling_ratio/min": 0.6685059666633606, "sampling/sampling_logp_difference/max": 0.4027099609375, "sampling/sampling_logp_difference/mean": 0.015721773728728294, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4598179757595062, "epoch": 2.0661764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 0.9812591949291634, "kl": 0.08628267049789429, "learning_rate": 2.693626843780665e-07, "loss": -0.0056, "num_tokens": 53310809.0, "reward": 0.09375, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5081480741500854, "sampling/importance_sampling_ratio/mean": 0.9999203681945801, "sampling/importance_sampling_ratio/min": 0.7070103287696838, "sampling/sampling_logp_difference/max": 0.41088247299194336, "sampling/sampling_logp_difference/mean": 0.015428414568305016, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 226.40625, "completions/mean_terminated_length": 226.40625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.392291396856308, "epoch": 2.0674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.016348212507894044, "kl": 0.03218259662389755, "learning_rate": 2.687308548795825e-07, "loss": 0.0003, "num_tokens": 53344883.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4507747888565063, "sampling/importance_sampling_ratio/mean": 1.0000643730163574, "sampling/importance_sampling_ratio/min": 0.6488705277442932, "sampling/sampling_logp_difference/max": 0.4325220584869385, "sampling/sampling_logp_difference/mean": 0.014131030067801476, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 241.296875, "completions/mean_terminated_length": 241.296875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.47790658473968506, "epoch": 2.0686274509803924, "frac_reward_zero_std": 0.5, "grad_norm": 1.2041471119046432, "kl": 0.06056524068117142, "learning_rate": 2.6809949484281164e-07, "loss": -0.0086, "num_tokens": 53386726.0, "reward": 0.0, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5972397327423096, "sampling/importance_sampling_ratio/mean": 0.9999131560325623, "sampling/importance_sampling_ratio/min": 0.5408050417900085, "sampling/sampling_logp_difference/max": 0.6146965026855469, "sampling/sampling_logp_difference/mean": 0.016169361770153046, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 238.328125, "completions/mean_terminated_length": 238.328125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.4980003535747528, "epoch": 2.0698529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.1714652987900576, "kl": 0.05430366098880768, "learning_rate": 2.674686055493748e-07, "loss": 0.0123, "num_tokens": 53419739.0, "reward": 0.3125, "reward_std": 0.42898139357566833, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.2911043167114258, "sampling/importance_sampling_ratio/mean": 1.0000584125518799, "sampling/importance_sampling_ratio/min": 0.7557491660118103, "sampling/sampling_logp_difference/max": 0.2800458073616028, "sampling/sampling_logp_difference/mean": 0.016370002180337906, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 281.78125, "completions/mean_terminated_length": 281.78125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.49693894386291504, "epoch": 2.071078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.5669572112070449, "kl": 0.03916829079389572, "learning_rate": 2.668381882799375e-07, "loss": 0.0005, "num_tokens": 53455693.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3657170534133911, "sampling/importance_sampling_ratio/mean": 0.9997628927230835, "sampling/importance_sampling_ratio/min": 0.6173994541168213, "sampling/sampling_logp_difference/max": 0.4822390079498291, "sampling/sampling_logp_difference/mean": 0.015329066663980484, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 213.1875, "completions/mean_terminated_length": 213.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.38387399911880493, "epoch": 2.0723039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.018357899831684923, "kl": 0.028213728219270706, "learning_rate": 2.662082443142068e-07, "loss": 0.0003, "num_tokens": 53485609.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5345168113708496, "sampling/importance_sampling_ratio/mean": 1.0005815029144287, "sampling/importance_sampling_ratio/min": 0.6108310222625732, "sampling/sampling_logp_difference/max": 0.4929349422454834, "sampling/sampling_logp_difference/mean": 0.01430382952094078, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 240.0625, "completions/mean_terminated_length": 240.0625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.36599016189575195, "epoch": 2.073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.025775944282998488, "kl": 0.03330044820904732, "learning_rate": 2.6557877493092883e-07, "loss": 0.0003, "num_tokens": 53518045.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996206760406494, "sampling/importance_sampling_ratio/min": 0.3689557909965515, "sampling/sampling_logp_difference/max": 0.9970784187316895, "sampling/sampling_logp_difference/mean": 0.013810910284519196, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 239.234375, "completions/mean_terminated_length": 239.234375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3981662392616272, "epoch": 2.0747549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.7185263627841968, "kl": 0.04459173604846001, "learning_rate": 2.6494978140788686e-07, "loss": 0.0165, "num_tokens": 53549660.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3532410860061646, "sampling/importance_sampling_ratio/mean": 1.0003100633621216, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.014453582465648651, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 231.546875, "completions/mean_terminated_length": 231.546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4637501835823059, "epoch": 2.075980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.5817636462228674, "kl": 0.0697750449180603, "learning_rate": 2.643212650218976e-07, "loss": 0.0174, "num_tokens": 53583007.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5472601652145386, "sampling/importance_sampling_ratio/mean": 0.9998758435249329, "sampling/importance_sampling_ratio/min": 0.6351385712623596, "sampling/sampling_logp_difference/max": 0.45391201972961426, "sampling/sampling_logp_difference/mean": 0.01557945366948843, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 245.734375, "completions/mean_terminated_length": 245.734375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4809524714946747, "epoch": 2.077205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.9827785276513454, "kl": 0.033042408525943756, "learning_rate": 2.6369322704881e-07, "loss": 0.0039, "num_tokens": 53620206.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.910003423690796, "sampling/importance_sampling_ratio/mean": 1.0003118515014648, "sampling/importance_sampling_ratio/min": 0.6336769461631775, "sampling/sampling_logp_difference/max": 0.6471049785614014, "sampling/sampling_logp_difference/mean": 0.016739681363105774, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 277.6875, "completions/mean_terminated_length": 277.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3514638841152191, "epoch": 2.0784313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.5419177648619975, "kl": 0.028552263975143433, "learning_rate": 2.6306566876350067e-07, "loss": 0.0003, "num_tokens": 53663482.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.546760082244873, "sampling/importance_sampling_ratio/mean": 1.0000574588775635, "sampling/importance_sampling_ratio/min": 0.683687150478363, "sampling/sampling_logp_difference/max": 0.43616247177124023, "sampling/sampling_logp_difference/mean": 0.012605156749486923, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 190.9375, "completions/mean_terminated_length": 190.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3160659670829773, "epoch": 2.079656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.020635832829643035, "kl": 0.0393621064722538, "learning_rate": 2.6243859143987367e-07, "loss": 0.0003, "num_tokens": 53688582.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4020789861679077, "sampling/importance_sampling_ratio/mean": 1.0002716779708862, "sampling/importance_sampling_ratio/min": 0.7184279561042786, "sampling/sampling_logp_difference/max": 0.3379560708999634, "sampling/sampling_logp_difference/mean": 0.01307186484336853, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 241.546875, "completions/mean_terminated_length": 241.546875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4061009883880615, "epoch": 2.0808823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.030250847742007314, "kl": 0.05404685065150261, "learning_rate": 2.6181199635085616e-07, "loss": 0.0006, "num_tokens": 53718937.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4975636005401611, "sampling/importance_sampling_ratio/mean": 1.0002026557922363, "sampling/importance_sampling_ratio/min": 0.6254689693450928, "sampling/sampling_logp_difference/max": 0.4692535400390625, "sampling/sampling_logp_difference/mean": 0.014528755098581314, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 272.140625, "completions/mean_terminated_length": 272.140625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.41196760535240173, "epoch": 2.082107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.01856909248973148, "kl": 0.02570771798491478, "learning_rate": 2.6118588476839607e-07, "loss": 0.0003, "num_tokens": 53753954.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5613126754760742, "sampling/importance_sampling_ratio/mean": 0.999936580657959, "sampling/importance_sampling_ratio/min": 0.6733855605125427, "sampling/sampling_logp_difference/max": 0.44552695751190186, "sampling/sampling_logp_difference/mean": 0.014957739971578121, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 178.28125, "completions/mean_terminated_length": 178.28125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3256189227104187, "epoch": 2.0833333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.03437947921858951, "kl": 0.040330491960048676, "learning_rate": 2.6056025796346094e-07, "loss": 0.0004, "num_tokens": 53781940.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6014013290405273, "sampling/importance_sampling_ratio/mean": 1.0005395412445068, "sampling/importance_sampling_ratio/min": 0.6476975679397583, "sampling/sampling_logp_difference/max": 0.47087907791137695, "sampling/sampling_logp_difference/mean": 0.013887629844248295, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 217.3125, "completions/mean_terminated_length": 217.3125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.42812711000442505, "epoch": 2.0845588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.6869708520780327, "kl": 0.05290180817246437, "learning_rate": 2.599351172060329e-07, "loss": 0.0003, "num_tokens": 53813640.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4987599849700928, "sampling/importance_sampling_ratio/mean": 0.9994872808456421, "sampling/importance_sampling_ratio/min": 0.5672615170478821, "sampling/sampling_logp_difference/max": 0.5669348239898682, "sampling/sampling_logp_difference/mean": 0.015493819490075111, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 284.328125, "completions/mean_terminated_length": 284.328125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4446394741535187, "epoch": 2.0857843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7015076939413134, "kl": 0.035604920238256454, "learning_rate": 2.593104637651087e-07, "loss": 0.0113, "num_tokens": 53851597.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8148858547210693, "sampling/importance_sampling_ratio/mean": 1.0000540018081665, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.5960226058959961, "sampling/sampling_logp_difference/mean": 0.014582192525267601, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 219.421875, "completions/mean_terminated_length": 219.421875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.43588775396347046, "epoch": 2.0870098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.8601369733440415, "kl": 0.04138219356536865, "learning_rate": 2.5868629890869463e-07, "loss": 0.0365, "num_tokens": 53883176.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4993034601211548, "sampling/importance_sampling_ratio/mean": 0.999489963054657, "sampling/importance_sampling_ratio/min": 0.6461346745491028, "sampling/sampling_logp_difference/max": 0.4367474317550659, "sampling/sampling_logp_difference/mean": 0.015711303800344467, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 262.3125, "completions/mean_terminated_length": 262.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3219606876373291, "epoch": 2.088235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.014227525741447764, "kl": 0.02282378077507019, "learning_rate": 2.580626239038061e-07, "loss": 0.0002, "num_tokens": 53917260.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6047582626342773, "sampling/importance_sampling_ratio/mean": 1.0003926753997803, "sampling/importance_sampling_ratio/min": 0.67872554063797, "sampling/sampling_logp_difference/max": 0.472973108291626, "sampling/sampling_logp_difference/mean": 0.012733912095427513, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 267.40625, "completions/mean_terminated_length": 267.40625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.44568419456481934, "epoch": 2.0894607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.6986790698162758, "kl": 0.03710407763719559, "learning_rate": 2.5743944001646387e-07, "loss": 0.0135, "num_tokens": 53955110.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.408916711807251, "sampling/importance_sampling_ratio/mean": 0.9997654557228088, "sampling/importance_sampling_ratio/min": 0.5261492133140564, "sampling/sampling_logp_difference/max": 0.6421704292297363, "sampling/sampling_logp_difference/mean": 0.015230206772685051, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 181.984375, "completions/mean_terminated_length": 181.984375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.39598989486694336, "epoch": 2.090686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.023557917460605267, "kl": 0.036407310515642166, "learning_rate": 2.568167485116919e-07, "loss": 0.0003, "num_tokens": 53987349.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3268274068832397, "sampling/importance_sampling_ratio/mean": 0.9992355704307556, "sampling/importance_sampling_ratio/min": 0.6835634112358093, "sampling/sampling_logp_difference/max": 0.3804359436035156, "sampling/sampling_logp_difference/mean": 0.015313874930143356, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 194.484375, "completions/mean_terminated_length": 194.484375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.46629300713539124, "epoch": 2.0919117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.8651152702445029, "kl": 0.10407014191150665, "learning_rate": 2.5619455065351435e-07, "loss": 0.0199, "num_tokens": 54019940.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5571781396865845, "sampling/importance_sampling_ratio/mean": 0.9995058178901672, "sampling/importance_sampling_ratio/min": 0.6799058318138123, "sampling/sampling_logp_difference/max": 0.4428752660751343, "sampling/sampling_logp_difference/mean": 0.016040362417697906, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 176.390625, "completions/mean_terminated_length": 176.390625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2929573059082031, "epoch": 2.093137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.022889010555841464, "kl": 0.03555411472916603, "learning_rate": 2.555728477049532e-07, "loss": 0.0004, "num_tokens": 54046653.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.611952304840088, "sampling/importance_sampling_ratio/mean": 0.9993224143981934, "sampling/importance_sampling_ratio/min": 0.5149512887001038, "sampling/sampling_logp_difference/max": 0.6636829376220703, "sampling/sampling_logp_difference/mean": 0.012265045195817947, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 184.6875, "completions/mean_terminated_length": 184.6875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3818640410900116, "epoch": 2.094362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.017828986446728298, "kl": 0.029479539021849632, "learning_rate": 2.5495164092802646e-07, "loss": 0.0003, "num_tokens": 54077881.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5072616338729858, "sampling/importance_sampling_ratio/mean": 0.9999698400497437, "sampling/importance_sampling_ratio/min": 0.7301764488220215, "sampling/sampling_logp_difference/max": 0.4102945327758789, "sampling/sampling_logp_difference/mean": 0.014310354366898537, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 222.484375, "completions/mean_terminated_length": 222.484375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3059423565864563, "epoch": 2.0955882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.5902274709945375, "kl": 0.02456137351691723, "learning_rate": 2.5433093158374437e-07, "loss": 0.013, "num_tokens": 54109768.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5791244506835938, "sampling/importance_sampling_ratio/mean": 0.9997508525848389, "sampling/importance_sampling_ratio/min": 0.5916173458099365, "sampling/sampling_logp_difference/max": 0.524895191192627, "sampling/sampling_logp_difference/mean": 0.011942279525101185, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 258.5625, "completions/mean_terminated_length": 258.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.40893906354904175, "epoch": 2.096813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.6073843168681505, "kl": 0.028748570010066032, "learning_rate": 2.537107209321074e-07, "loss": -0.0074, "num_tokens": 54146076.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4350489377975464, "sampling/importance_sampling_ratio/mean": 0.9998239874839783, "sampling/importance_sampling_ratio/min": 0.6944383382797241, "sampling/sampling_logp_difference/max": 0.3646519184112549, "sampling/sampling_logp_difference/mean": 0.014718063175678253, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 196.328125, "completions/mean_terminated_length": 196.328125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4684864282608032, "epoch": 2.0980392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8973405832334115, "kl": 0.05926259234547615, "learning_rate": 2.5309101023210424e-07, "loss": -0.01, "num_tokens": 54174337.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.488796591758728, "sampling/importance_sampling_ratio/mean": 1.0010432004928589, "sampling/importance_sampling_ratio/min": 0.6786234378814697, "sampling/sampling_logp_difference/max": 0.3979681730270386, "sampling/sampling_logp_difference/mean": 0.01676376909017563, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 213.40625, "completions/mean_terminated_length": 213.40625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3964466452598572, "epoch": 2.099264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02150098145118524, "kl": 0.03202517330646515, "learning_rate": 2.524718007417081e-07, "loss": 0.0003, "num_tokens": 54205211.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5897349119186401, "sampling/importance_sampling_ratio/mean": 0.99957674741745, "sampling/importance_sampling_ratio/min": 0.6785008311271667, "sampling/sampling_logp_difference/max": 0.46356725692749023, "sampling/sampling_logp_difference/mean": 0.01471491064876318, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 234.0625, "completions/mean_terminated_length": 234.0625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4208112955093384, "epoch": 2.1004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8814452155258764, "kl": 0.03885569050908089, "learning_rate": 2.518530937178751e-07, "loss": 0.0124, "num_tokens": 54241231.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.000196933746338, "sampling/importance_sampling_ratio/min": 0.6938435435295105, "sampling/sampling_logp_difference/max": 0.4361441135406494, "sampling/sampling_logp_difference/mean": 0.014252031221985817, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 233.90625, "completions/mean_terminated_length": 233.90625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4084511399269104, "epoch": 2.1017156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.025920117237784642, "kl": 0.038195956498384476, "learning_rate": 2.512348904165411e-07, "loss": 0.0004, "num_tokens": 54274217.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4753884077072144, "sampling/importance_sampling_ratio/mean": 0.9999651908874512, "sampling/importance_sampling_ratio/min": 0.4877993166446686, "sampling/sampling_logp_difference/max": 0.7178511619567871, "sampling/sampling_logp_difference/mean": 0.015018817037343979, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 168.21875, "completions/mean_terminated_length": 168.21875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.33479487895965576, "epoch": 2.1029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.023277676977274776, "kl": 0.030932046473026276, "learning_rate": 2.5061719209262e-07, "loss": 0.0003, "num_tokens": 54299799.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3236008882522583, "sampling/importance_sampling_ratio/mean": 0.9999775886535645, "sampling/importance_sampling_ratio/min": 0.6771601438522339, "sampling/sampling_logp_difference/max": 0.3898475170135498, "sampling/sampling_logp_difference/mean": 0.013862754218280315, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 197.546875, "completions/mean_terminated_length": 197.546875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4255238175392151, "epoch": 2.1041666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.022266585419841105, "kl": 0.03658965975046158, "learning_rate": 2.500000000000001e-07, "loss": 0.0004, "num_tokens": 54332122.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3816039562225342, "sampling/importance_sampling_ratio/mean": 0.9997903108596802, "sampling/importance_sampling_ratio/min": 0.6319694519042969, "sampling/sampling_logp_difference/max": 0.45891427993774414, "sampling/sampling_logp_difference/mean": 0.016192376613616943, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 162.0625, "completions/mean_terminated_length": 162.0625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.26786911487579346, "epoch": 2.105392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.022903144719932732, "kl": 0.027647744864225388, "learning_rate": 2.49383315391542e-07, "loss": 0.0003, "num_tokens": 54356398.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4758132696151733, "sampling/importance_sampling_ratio/mean": 0.9994162321090698, "sampling/importance_sampling_ratio/min": 0.6632112264633179, "sampling/sampling_logp_difference/max": 0.4106616973876953, "sampling/sampling_logp_difference/mean": 0.012282269075512886, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 162.84375, "completions/mean_terminated_length": 162.84375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.34371936321258545, "epoch": 2.1066176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.1992139792879184, "kl": 0.045130655169487, "learning_rate": 2.4876713951907685e-07, "loss": -0.0381, "num_tokens": 54382868.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5425312519073486, "sampling/importance_sampling_ratio/mean": 0.9999926090240479, "sampling/importance_sampling_ratio/min": 0.668948233127594, "sampling/sampling_logp_difference/max": 0.433424711227417, "sampling/sampling_logp_difference/mean": 0.014869507402181625, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 201.890625, "completions/mean_terminated_length": 201.890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.35909855365753174, "epoch": 2.107843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.719253450724179, "kl": 0.058807168155908585, "learning_rate": 2.481514736334022e-07, "loss": 0.0182, "num_tokens": 54410109.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6629894971847534, "sampling/importance_sampling_ratio/mean": 0.9998665452003479, "sampling/importance_sampling_ratio/min": 0.6298564672470093, "sampling/sampling_logp_difference/max": 0.5086169242858887, "sampling/sampling_logp_difference/mean": 0.01432094443589449, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 223.65625, "completions/mean_terminated_length": 223.65625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.44382837414741516, "epoch": 2.1090686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.042830580239373986, "kl": 0.05109231919050217, "learning_rate": 2.4753631898428134e-07, "loss": 0.0005, "num_tokens": 54444903.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4976780414581299, "sampling/importance_sampling_ratio/mean": 1.0000474452972412, "sampling/importance_sampling_ratio/min": 0.6262986660003662, "sampling/sampling_logp_difference/max": 0.4679279327392578, "sampling/sampling_logp_difference/mean": 0.015708569437265396, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 236.765625, "completions/mean_terminated_length": 236.765625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4542069137096405, "epoch": 2.110294117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0527350717176063, "kl": 0.038782984018325806, "learning_rate": 2.4692167682043853e-07, "loss": -0.043, "num_tokens": 54490216.0, "reward": 0.28125, "reward_std": 0.38319888710975647, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4028143882751465, "sampling/importance_sampling_ratio/mean": 0.9994980096817017, "sampling/importance_sampling_ratio/min": 0.6208435297012329, "sampling/sampling_logp_difference/max": 0.47667622566223145, "sampling/sampling_logp_difference/mean": 0.016528688371181488, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 199.46875, "completions/mean_terminated_length": 199.46875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.5112206339836121, "epoch": 2.111519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6291184660248751, "kl": 0.054802052676677704, "learning_rate": 2.4630754838955896e-07, "loss": -0.0041, "num_tokens": 54519750.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5745244026184082, "sampling/importance_sampling_ratio/mean": 0.9998692870140076, "sampling/importance_sampling_ratio/min": 0.5896148085594177, "sampling/sampling_logp_difference/max": 0.5282858610153198, "sampling/sampling_logp_difference/mean": 0.017753848806023598, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 224.96875, "completions/mean_terminated_length": 224.96875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.41554874181747437, "epoch": 2.1127450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 0.9198415011654026, "kl": 0.06762229651212692, "learning_rate": 2.456939349382843e-07, "loss": 0.0265, "num_tokens": 54552964.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3247536420822144, "sampling/importance_sampling_ratio/mean": 1.0001225471496582, "sampling/importance_sampling_ratio/min": 0.6661868691444397, "sampling/sampling_logp_difference/max": 0.4061850309371948, "sampling/sampling_logp_difference/mean": 0.014037791639566422, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 177.96875, "completions/mean_terminated_length": 177.96875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.37972375750541687, "epoch": 2.113970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.01885291795983761, "kl": 0.028345579281449318, "learning_rate": 2.450808377122107e-07, "loss": 0.0003, "num_tokens": 54582034.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.35982346534729, "sampling/importance_sampling_ratio/mean": 1.0004932880401611, "sampling/importance_sampling_ratio/min": 0.7064409852027893, "sampling/sampling_logp_difference/max": 0.3475155830383301, "sampling/sampling_logp_difference/mean": 0.014719485305249691, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 197.078125, "completions/mean_terminated_length": 197.078125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3369651436805725, "epoch": 2.1151960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.018305539672001096, "kl": 0.02534719929099083, "learning_rate": 2.4446825795588716e-07, "loss": 0.0002, "num_tokens": 54614135.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3517695665359497, "sampling/importance_sampling_ratio/mean": 0.999598503112793, "sampling/importance_sampling_ratio/min": 0.6802843809127808, "sampling/sampling_logp_difference/max": 0.38524436950683594, "sampling/sampling_logp_difference/mean": 0.013371542096138, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 242.765625, "completions/mean_terminated_length": 242.765625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.38267743587493896, "epoch": 2.116421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.03252813883627779, "kl": 0.043914198875427246, "learning_rate": 2.438561969128114e-07, "loss": 0.0005, "num_tokens": 54649784.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071556568145752, "sampling/importance_sampling_ratio/mean": 0.9998723268508911, "sampling/importance_sampling_ratio/min": 0.6065890789031982, "sampling/sampling_logp_difference/max": 0.49990367889404297, "sampling/sampling_logp_difference/mean": 0.012969196774065495, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 170.390625, "completions/mean_terminated_length": 170.390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.39506852626800537, "epoch": 2.1176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.0446569400289303, "kl": 0.04495479539036751, "learning_rate": 2.43244655825429e-07, "loss": -0.0077, "num_tokens": 54674865.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.6598515510559082, "sampling/importance_sampling_ratio/mean": 1.0003743171691895, "sampling/importance_sampling_ratio/min": 0.7296468019485474, "sampling/sampling_logp_difference/max": 0.5067281723022461, "sampling/sampling_logp_difference/mean": 0.016044920310378075, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 163.34375, "completions/mean_terminated_length": 163.34375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2899906039237976, "epoch": 2.1188725490196076, "frac_reward_zero_std": 1.0, "grad_norm": 0.027816125360010296, "kl": 0.03442544862627983, "learning_rate": 2.4263363593512903e-07, "loss": 0.0003, "num_tokens": 54700055.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4803112745285034, "sampling/importance_sampling_ratio/mean": 0.9994245767593384, "sampling/importance_sampling_ratio/min": 0.6799276471138, "sampling/sampling_logp_difference/max": 0.39225244522094727, "sampling/sampling_logp_difference/mean": 0.013562307693064213, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 195.96875, "completions/mean_terminated_length": 195.96875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4244498610496521, "epoch": 2.1200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.9310287504765901, "kl": 0.038948509842157364, "learning_rate": 2.4202313848224364e-07, "loss": -0.0042, "num_tokens": 54729957.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5012526512145996, "sampling/importance_sampling_ratio/mean": 0.9999551177024841, "sampling/importance_sampling_ratio/min": 0.6009517312049866, "sampling/sampling_logp_difference/max": 0.5092406272888184, "sampling/sampling_logp_difference/mean": 0.016196755692362785, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 273.15625, "completions/mean_terminated_length": 273.15625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4847327470779419, "epoch": 2.1213235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.015332241408022, "kl": 0.0634094625711441, "learning_rate": 2.414131647060436e-07, "loss": 0.0162, "num_tokens": 54773119.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003219842910767, "sampling/importance_sampling_ratio/min": 0.6625234484672546, "sampling/sampling_logp_difference/max": 0.9516277313232422, "sampling/sampling_logp_difference/mean": 0.0151731688529253, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 193.09375, "completions/mean_terminated_length": 193.09375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.38262057304382324, "epoch": 2.122549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02093126405835027, "kl": 0.02932485193014145, "learning_rate": 2.4080371584473745e-07, "loss": 0.0003, "num_tokens": 54801349.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4112415313720703, "sampling/importance_sampling_ratio/mean": 1.0002636909484863, "sampling/importance_sampling_ratio/min": 0.6678059101104736, "sampling/sampling_logp_difference/max": 0.4037576913833618, "sampling/sampling_logp_difference/mean": 0.01606777496635914, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3532024621963501, "epoch": 2.123774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.028520888685228243, "kl": 0.03388208895921707, "learning_rate": 2.4019479313546757e-07, "loss": 0.0003, "num_tokens": 54840125.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3584169149398804, "sampling/importance_sampling_ratio/mean": 0.9997031092643738, "sampling/importance_sampling_ratio/min": 0.591215193271637, "sampling/sampling_logp_difference/max": 0.5255752205848694, "sampling/sampling_logp_difference/mean": 0.012861925177276134, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 209.453125, "completions/mean_terminated_length": 209.453125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4223269522190094, "epoch": 2.125, "frac_reward_zero_std": 0.75, "grad_norm": 0.9339945437108128, "kl": 0.045595258474349976, "learning_rate": 2.395863978143083e-07, "loss": -0.0341, "num_tokens": 54879226.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000205159187317, "sampling/importance_sampling_ratio/min": 0.6413000822067261, "sampling/sampling_logp_difference/max": 0.9379068613052368, "sampling/sampling_logp_difference/mean": 0.016056055203080177, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 187.0625, "completions/mean_terminated_length": 187.0625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.32577162981033325, "epoch": 2.126225490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.02807865217048386, "kl": 0.03253490477800369, "learning_rate": 2.3897853111626417e-07, "loss": 0.0003, "num_tokens": 54911102.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6307547092437744, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.5113306641578674, "sampling/sampling_logp_difference/max": 0.6707388162612915, "sampling/sampling_logp_difference/mean": 0.012766007333993912, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 155.859375, "completions/mean_terminated_length": 155.859375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.40753477811813354, "epoch": 2.127450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03305307656653402, "kl": 0.03920695185661316, "learning_rate": 2.383711942752652e-07, "loss": 0.0004, "num_tokens": 54939541.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5626815557479858, "sampling/importance_sampling_ratio/mean": 0.9999619722366333, "sampling/importance_sampling_ratio/min": 0.7706030011177063, "sampling/sampling_logp_difference/max": 0.44640326499938965, "sampling/sampling_logp_difference/mean": 0.015735357999801636, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 226.140625, "completions/mean_terminated_length": 226.140625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4630538523197174, "epoch": 2.1286764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.7251203505688483, "kl": 0.05249332636594772, "learning_rate": 2.377643885241674e-07, "loss": -0.0052, "num_tokens": 54977422.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4917874336242676, "sampling/importance_sampling_ratio/mean": 1.0001999139785767, "sampling/importance_sampling_ratio/min": 0.6482202410697937, "sampling/sampling_logp_difference/max": 0.4335247278213501, "sampling/sampling_logp_difference/mean": 0.015336014330387115, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 208.796875, "completions/mean_terminated_length": 208.796875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4857085943222046, "epoch": 2.1299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8431751513723151, "kl": 0.04069080576300621, "learning_rate": 2.371581150947476e-07, "loss": 0.0155, "num_tokens": 55009233.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3904049396514893, "sampling/importance_sampling_ratio/mean": 0.999758243560791, "sampling/importance_sampling_ratio/min": 0.6156575679779053, "sampling/sampling_logp_difference/max": 0.48506438732147217, "sampling/sampling_logp_difference/mean": 0.01667597144842148, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 236.828125, "completions/mean_terminated_length": 236.828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3865073323249817, "epoch": 2.1311274509803924, "frac_reward_zero_std": 0.75, "grad_norm": 0.7945251971627644, "kl": 0.03433482348918915, "learning_rate": 2.3655237521770282e-07, "loss": 0.0227, "num_tokens": 55044326.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.44430410861969, "sampling/importance_sampling_ratio/mean": 0.9998413324356079, "sampling/importance_sampling_ratio/min": 0.639488697052002, "sampling/sampling_logp_difference/max": 0.4470863342285156, "sampling/sampling_logp_difference/mean": 0.013868868350982666, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 225.828125, "completions/mean_terminated_length": 225.828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4157715141773224, "epoch": 2.1323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.023297379077723845, "kl": 0.02754153311252594, "learning_rate": 2.3594717012264642e-07, "loss": 0.0003, "num_tokens": 55078379.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9877310991287231, "sampling/importance_sampling_ratio/mean": 1.000409483909607, "sampling/importance_sampling_ratio/min": 0.5863022208213806, "sampling/sampling_logp_difference/max": 0.6869938373565674, "sampling/sampling_logp_difference/mean": 0.014839432202279568, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 226.671875, "completions/mean_terminated_length": 226.671875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4221544563770294, "epoch": 2.133578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.016172086290548435, "kl": 0.03135847672820091, "learning_rate": 2.3534250103810627e-07, "loss": 0.0003, "num_tokens": 55111334.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7786999940872192, "sampling/importance_sampling_ratio/mean": 1.0000792741775513, "sampling/importance_sampling_ratio/min": 0.6914989948272705, "sampling/sampling_logp_difference/max": 0.5758827924728394, "sampling/sampling_logp_difference/mean": 0.015480948612093925, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4866619110107422, "epoch": 2.1348039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.0196562728494614, "kl": 0.054346490651369095, "learning_rate": 2.3473836919152263e-07, "loss": -0.003, "num_tokens": 55148918.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.8142759799957275, "sampling/importance_sampling_ratio/mean": 1.0004303455352783, "sampling/importance_sampling_ratio/min": 0.4147556722164154, "sampling/sampling_logp_difference/max": 0.8800656795501709, "sampling/sampling_logp_difference/mean": 0.016502011567354202, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 201.453125, "completions/mean_terminated_length": 201.453125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.423952579498291, "epoch": 2.136029411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.7835543064481314, "kl": 0.06791205704212189, "learning_rate": 2.3413477580924475e-07, "loss": 0.0385, "num_tokens": 55179555.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4241691827774048, "sampling/importance_sampling_ratio/mean": 1.0000262260437012, "sampling/importance_sampling_ratio/min": 0.6093135476112366, "sampling/sampling_logp_difference/max": 0.49542236328125, "sampling/sampling_logp_difference/mean": 0.015081222169101238, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 184.328125, "completions/mean_terminated_length": 184.328125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.34962934255599976, "epoch": 2.1372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.018699136234511006, "kl": 0.026226745918393135, "learning_rate": 2.3353172211652884e-07, "loss": 0.0003, "num_tokens": 55212616.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4477838277816772, "sampling/importance_sampling_ratio/mean": 1.000114917755127, "sampling/importance_sampling_ratio/min": 0.6632859706878662, "sampling/sampling_logp_difference/max": 0.4105490446090698, "sampling/sampling_logp_difference/mean": 0.013516898266971111, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 202.203125, "completions/mean_terminated_length": 202.203125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3871409296989441, "epoch": 2.138480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7748667693524827, "kl": 0.043323516845703125, "learning_rate": 2.329292093375356e-07, "loss": -0.0124, "num_tokens": 55241413.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5215389728546143, "sampling/importance_sampling_ratio/mean": 1.0001754760742188, "sampling/importance_sampling_ratio/min": 0.6299866437911987, "sampling/sampling_logp_difference/max": 0.46205663681030273, "sampling/sampling_logp_difference/mean": 0.014800415374338627, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.4962119162082672, "epoch": 2.139705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.030377870768547713, "kl": 0.06676479429006577, "learning_rate": 2.3232723869532816e-07, "loss": 0.0007, "num_tokens": 55271945.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4092849493026733, "sampling/importance_sampling_ratio/mean": 0.9996241331100464, "sampling/importance_sampling_ratio/min": 0.6529548168182373, "sampling/sampling_logp_difference/max": 0.42624735832214355, "sampling/sampling_logp_difference/mean": 0.016974840313196182, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 198.328125, "completions/mean_terminated_length": 198.328125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4542536735534668, "epoch": 2.1409313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 1.1187618944155129, "kl": 0.0751677006483078, "learning_rate": 2.3172581141186858e-07, "loss": -0.0114, "num_tokens": 55299358.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4223324060440063, "sampling/importance_sampling_ratio/mean": 0.9998506903648376, "sampling/importance_sampling_ratio/min": 0.6970131993293762, "sampling/sampling_logp_difference/max": 0.36095094680786133, "sampling/sampling_logp_difference/mean": 0.016000093892216682, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 212.21875, "completions/mean_terminated_length": 212.21875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.41988879442214966, "epoch": 2.142156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.025334713347893854, "kl": 0.04268532991409302, "learning_rate": 2.3112492870801602e-07, "loss": 0.0004, "num_tokens": 55333452.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4602017402648926, "sampling/importance_sampling_ratio/mean": 0.9998769760131836, "sampling/importance_sampling_ratio/min": 0.6052498817443848, "sampling/sampling_logp_difference/max": 0.5021138191223145, "sampling/sampling_logp_difference/mean": 0.015501865185797215, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 203.9375, "completions/mean_terminated_length": 203.9375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.34786057472229004, "epoch": 2.1433823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.019621902407131974, "kl": 0.03109024092555046, "learning_rate": 2.3052459180352458e-07, "loss": 0.0003, "num_tokens": 55366120.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6022891998291016, "sampling/importance_sampling_ratio/mean": 1.0002949237823486, "sampling/importance_sampling_ratio/min": 0.6090093851089478, "sampling/sampling_logp_difference/max": 0.49592161178588867, "sampling/sampling_logp_difference/mean": 0.014146235771477222, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 265.09375, "completions/mean_terminated_length": 265.09375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3917834162712097, "epoch": 2.144607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.015290380997986797, "kl": 0.024292241781949997, "learning_rate": 2.2992480191704e-07, "loss": 0.0002, "num_tokens": 55407694.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9756038188934326, "sampling/importance_sampling_ratio/mean": 1.0001131296157837, "sampling/importance_sampling_ratio/min": 0.6060279607772827, "sampling/sampling_logp_difference/max": 0.6808741092681885, "sampling/sampling_logp_difference/mean": 0.013634631410241127, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 182.53125, "completions/mean_terminated_length": 182.53125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.33123937249183655, "epoch": 2.1458333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.9168386050176408, "kl": 0.043895699083805084, "learning_rate": 2.2932556026609777e-07, "loss": -0.0295, "num_tokens": 55441120.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5277621746063232, "sampling/importance_sampling_ratio/mean": 1.0001862049102783, "sampling/importance_sampling_ratio/min": 0.6155057549476624, "sampling/sampling_logp_difference/max": 0.48531103134155273, "sampling/sampling_logp_difference/mean": 0.012891734018921852, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 257.296875, "completions/mean_terminated_length": 257.296875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4753401577472687, "epoch": 2.1470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.029490820147152666, "kl": 0.03417005389928818, "learning_rate": 2.2872686806712032e-07, "loss": 0.0004, "num_tokens": 55479907.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4761298894882202, "sampling/importance_sampling_ratio/mean": 1.000026822090149, "sampling/importance_sampling_ratio/min": 0.6266134977340698, "sampling/sampling_logp_difference/max": 0.4674253463745117, "sampling/sampling_logp_difference/mean": 0.016516663134098053, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 219.28125, "completions/mean_terminated_length": 219.28125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.5341153144836426, "epoch": 2.1482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.02628875241651288, "kl": 0.045024674385786057, "learning_rate": 2.2812872653541498e-07, "loss": 0.0005, "num_tokens": 55518293.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3663583993911743, "sampling/importance_sampling_ratio/mean": 1.0003935098648071, "sampling/importance_sampling_ratio/min": 0.6626349687576294, "sampling/sampling_logp_difference/max": 0.4115309715270996, "sampling/sampling_logp_difference/mean": 0.017857030034065247, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 219.015625, "completions/mean_terminated_length": 219.015625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4271315634250641, "epoch": 2.1495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7242580390124498, "kl": 0.052868179976940155, "learning_rate": 2.2753113688517155e-07, "loss": 0.0087, "num_tokens": 55555750.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.579295039176941, "sampling/importance_sampling_ratio/mean": 1.000077247619629, "sampling/importance_sampling_ratio/min": 0.5911200046539307, "sampling/sampling_logp_difference/max": 0.5257362127304077, "sampling/sampling_logp_difference/mean": 0.014551831409335136, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 251.65625, "completions/mean_terminated_length": 251.65625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3884498178958893, "epoch": 2.150735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.8531369846460369, "kl": 0.02951735258102417, "learning_rate": 2.2693410032945853e-07, "loss": -0.0039, "num_tokens": 55594352.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5480871200561523, "sampling/importance_sampling_ratio/mean": 1.0000702142715454, "sampling/importance_sampling_ratio/min": 0.7222326397895813, "sampling/sampling_logp_difference/max": 0.43702006340026855, "sampling/sampling_logp_difference/mean": 0.0142079321667552, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 198.078125, "completions/mean_terminated_length": 198.078125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4085216820240021, "epoch": 2.1519607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.018627729681306757, "kl": 0.02471754141151905, "learning_rate": 2.2633761808022272e-07, "loss": 0.0002, "num_tokens": 55627093.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6256582736968994, "sampling/importance_sampling_ratio/mean": 0.9996710419654846, "sampling/importance_sampling_ratio/min": 0.6510365605354309, "sampling/sampling_logp_difference/max": 0.4859127998352051, "sampling/sampling_logp_difference/mean": 0.015194547362625599, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 162.53125, "completions/mean_terminated_length": 162.53125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3991209864616394, "epoch": 2.153186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.7667242884706594, "kl": 0.06161896884441376, "learning_rate": 2.2574169134828526e-07, "loss": -0.0191, "num_tokens": 55652567.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.439162254333496, "sampling/importance_sampling_ratio/mean": 1.0000481605529785, "sampling/importance_sampling_ratio/min": 0.6192695498466492, "sampling/sampling_logp_difference/max": 0.4792146682739258, "sampling/sampling_logp_difference/mean": 0.01625451073050499, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 223.53125, "completions/mean_terminated_length": 223.53125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.5019240379333496, "epoch": 2.1544117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.601437594217833, "kl": 0.0837329775094986, "learning_rate": 2.2514632134333932e-07, "loss": -0.0097, "num_tokens": 55684553.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4870280027389526, "sampling/importance_sampling_ratio/mean": 1.0001258850097656, "sampling/importance_sampling_ratio/min": 0.637239933013916, "sampling/sampling_logp_difference/max": 0.4506089687347412, "sampling/sampling_logp_difference/mean": 0.017832860350608826, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 261.484375, "completions/mean_terminated_length": 261.484375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3401559591293335, "epoch": 2.155637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6481609273172343, "kl": 0.033339258283376694, "learning_rate": 2.2455150927394878e-07, "loss": -0.0147, "num_tokens": 55719624.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4729355573654175, "sampling/importance_sampling_ratio/mean": 0.9997318983078003, "sampling/importance_sampling_ratio/min": 0.629811704158783, "sampling/sampling_logp_difference/max": 0.46233439445495605, "sampling/sampling_logp_difference/mean": 0.011864010244607925, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 233.09375, "completions/mean_terminated_length": 233.09375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.45578038692474365, "epoch": 2.156862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.8057610589224533, "kl": 0.03374443203210831, "learning_rate": 2.2395725634754402e-07, "loss": 0.0215, "num_tokens": 55754254.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4249826669692993, "sampling/importance_sampling_ratio/mean": 1.0005931854248047, "sampling/importance_sampling_ratio/min": 0.6372078061103821, "sampling/sampling_logp_difference/max": 0.45065951347351074, "sampling/sampling_logp_difference/mean": 0.016326431185007095, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 220.1875, "completions/mean_terminated_length": 220.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3218801021575928, "epoch": 2.1580882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.01423016017040416, "kl": 0.02158094383776188, "learning_rate": 2.2336356377042143e-07, "loss": 0.0002, "num_tokens": 55783370.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.516823172569275, "sampling/importance_sampling_ratio/mean": 0.9995269179344177, "sampling/importance_sampling_ratio/min": 0.6208050847053528, "sampling/sampling_logp_difference/max": 0.47673821449279785, "sampling/sampling_logp_difference/mean": 0.013418269343674183, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 181.671875, "completions/mean_terminated_length": 181.671875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.36425530910491943, "epoch": 2.159313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.029893002756899607, "kl": 0.04176114499568939, "learning_rate": 2.2277043274773854e-07, "loss": 0.0004, "num_tokens": 55813365.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007848978042603, "sampling/importance_sampling_ratio/mean": 1.0001916885375977, "sampling/importance_sampling_ratio/min": 0.6393733620643616, "sampling/sampling_logp_difference/max": 0.47049403190612793, "sampling/sampling_logp_difference/mean": 0.01389409601688385, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 169.78125, "completions/mean_terminated_length": 169.78125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.33382275700569153, "epoch": 2.1605392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.020204153809200747, "kl": 0.029215075075626373, "learning_rate": 2.221778644835144e-07, "loss": 0.0003, "num_tokens": 55840471.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7818156480789185, "sampling/importance_sampling_ratio/mean": 1.0002161264419556, "sampling/importance_sampling_ratio/min": 0.6140713691711426, "sampling/sampling_logp_difference/max": 0.5776329040527344, "sampling/sampling_logp_difference/mean": 0.014868385158479214, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 176.390625, "completions/mean_terminated_length": 176.390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.35788238048553467, "epoch": 2.161764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.021212287763992144, "kl": 0.028317486867308617, "learning_rate": 2.215858601806246e-07, "loss": 0.0003, "num_tokens": 55867280.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5563019514083862, "sampling/importance_sampling_ratio/mean": 0.9995063543319702, "sampling/importance_sampling_ratio/min": 0.41623008251190186, "sampling/sampling_logp_difference/max": 0.8765170574188232, "sampling/sampling_logp_difference/mean": 0.014820320531725883, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 189.328125, "completions/mean_terminated_length": 189.328125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.32203784584999084, "epoch": 2.1629901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.02518747976591494, "kl": 0.027678757905960083, "learning_rate": 2.2099442104080075e-07, "loss": 0.0003, "num_tokens": 55893717.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4755914211273193, "sampling/importance_sampling_ratio/mean": 0.9997338652610779, "sampling/importance_sampling_ratio/min": 0.6314525008201599, "sampling/sampling_logp_difference/max": 0.4597325325012207, "sampling/sampling_logp_difference/mean": 0.013499148190021515, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 239.65625, "completions/mean_terminated_length": 239.65625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.46300262212753296, "epoch": 2.1642156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.7057487602303449, "kl": 0.05219618231058121, "learning_rate": 2.2040354826462664e-07, "loss": 0.0082, "num_tokens": 55930383.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6213703155517578, "sampling/importance_sampling_ratio/mean": 0.9996943473815918, "sampling/importance_sampling_ratio/min": 0.6223365664482117, "sampling/sampling_logp_difference/max": 0.48327159881591797, "sampling/sampling_logp_difference/mean": 0.01587051898241043, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 157.671875, "completions/mean_terminated_length": 157.671875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3129045069217682, "epoch": 2.1654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.023785586631170752, "kl": 0.03676670044660568, "learning_rate": 2.1981324305153642e-07, "loss": 0.0003, "num_tokens": 55955546.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6219816207885742, "sampling/importance_sampling_ratio/mean": 1.00071382522583, "sampling/importance_sampling_ratio/min": 0.7006001472473145, "sampling/sampling_logp_difference/max": 0.48364853858947754, "sampling/sampling_logp_difference/mean": 0.014173779636621475, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 235.3125, "completions/mean_terminated_length": 235.3125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.49570128321647644, "epoch": 2.1666666666666665, "frac_reward_zero_std": 0.5, "grad_norm": 1.078351481905372, "kl": 0.09133053570985794, "learning_rate": 2.192235065998126e-07, "loss": -0.0105, "num_tokens": 55989134.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5448896884918213, "sampling/importance_sampling_ratio/mean": 0.998965859413147, "sampling/importance_sampling_ratio/min": 0.6119261384010315, "sampling/sampling_logp_difference/max": 0.49114370346069336, "sampling/sampling_logp_difference/mean": 0.017726827412843704, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 203.765625, "completions/mean_terminated_length": 203.765625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3536602258682251, "epoch": 2.167892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.018581054929521598, "kl": 0.024750471115112305, "learning_rate": 2.1863434010658272e-07, "loss": 0.0002, "num_tokens": 56020303.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5407379865646362, "sampling/importance_sampling_ratio/mean": 1.0003414154052734, "sampling/importance_sampling_ratio/min": 0.7101253867149353, "sampling/sampling_logp_difference/max": 0.43226146697998047, "sampling/sampling_logp_difference/mean": 0.013323700055480003, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 203.578125, "completions/mean_terminated_length": 203.578125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.45446696877479553, "epoch": 2.1691176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.02235162020305461, "kl": 0.035784099251031876, "learning_rate": 2.1804574476781733e-07, "loss": 0.0003, "num_tokens": 56049460.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5750471353530884, "sampling/importance_sampling_ratio/mean": 0.9996752738952637, "sampling/importance_sampling_ratio/min": 0.6301924586296082, "sampling/sampling_logp_difference/max": 0.4617300033569336, "sampling/sampling_logp_difference/mean": 0.016752976924180984, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 170.578125, "completions/mean_terminated_length": 170.578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.29969990253448486, "epoch": 2.170343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0173527336844538, "kl": 0.024074679240584373, "learning_rate": 2.1745772177832755e-07, "loss": 0.0002, "num_tokens": 56079833.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5046863555908203, "sampling/importance_sampling_ratio/mean": 0.9999350905418396, "sampling/importance_sampling_ratio/min": 0.6972651481628418, "sampling/sampling_logp_difference/max": 0.40858447551727295, "sampling/sampling_logp_difference/mean": 0.011612225323915482, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 203.78125, "completions/mean_terminated_length": 203.78125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2922014594078064, "epoch": 2.1715686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.017945475451335936, "kl": 0.023235086351633072, "learning_rate": 2.1687027233176318e-07, "loss": 0.0002, "num_tokens": 56107307.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4358521699905396, "sampling/importance_sampling_ratio/mean": 0.9999434947967529, "sampling/importance_sampling_ratio/min": 0.5521895289421082, "sampling/sampling_logp_difference/max": 0.5938639640808105, "sampling/sampling_logp_difference/mean": 0.012580793350934982, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 275.578125, "completions/mean_terminated_length": 275.578125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3100026249885559, "epoch": 2.172794117647059, "frac_reward_zero_std": 0.5, "grad_norm": 1.0856831069838297, "kl": 0.03622254356741905, "learning_rate": 2.1628339762060914e-07, "loss": 0.0132, "num_tokens": 56144720.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4009132385253906, "sampling/importance_sampling_ratio/mean": 1.000067114830017, "sampling/importance_sampling_ratio/min": 0.6049524545669556, "sampling/sampling_logp_difference/max": 0.5026054382324219, "sampling/sampling_logp_difference/mean": 0.012776928022503853, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 236.984375, "completions/mean_terminated_length": 236.984375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.37184542417526245, "epoch": 2.174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7717181530374922, "kl": 0.055422067642211914, "learning_rate": 2.1569709883618382e-07, "loss": -0.005, "num_tokens": 56183039.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5849214792251587, "sampling/importance_sampling_ratio/mean": 1.000169038772583, "sampling/importance_sampling_ratio/min": 0.6072410941123962, "sampling/sampling_logp_difference/max": 0.49882936477661133, "sampling/sampling_logp_difference/mean": 0.014009572565555573, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 272.984375, "completions/mean_terminated_length": 272.984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4569479823112488, "epoch": 2.1752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.04731987310548251, "kl": 0.052137378603219986, "learning_rate": 2.1511137716863687e-07, "loss": 0.0006, "num_tokens": 56222382.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4306907653808594, "sampling/importance_sampling_ratio/mean": 1.0007603168487549, "sampling/importance_sampling_ratio/min": 0.710547924041748, "sampling/sampling_logp_difference/max": 0.3581573963165283, "sampling/sampling_logp_difference/mean": 0.01567883789539337, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 191.78125, "completions/mean_terminated_length": 191.78125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.39548245072364807, "epoch": 2.176470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.1297765565500506, "kl": 0.07630042731761932, "learning_rate": 2.1452623380694602e-07, "loss": -0.0205, "num_tokens": 56252000.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3939874172210693, "sampling/importance_sampling_ratio/mean": 1.0004687309265137, "sampling/importance_sampling_ratio/min": 0.6412686705589294, "sampling/sampling_logp_difference/max": 0.44430673122406006, "sampling/sampling_logp_difference/mean": 0.015118453651666641, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 238.90625, "completions/mean_terminated_length": 238.90625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.4489264190196991, "epoch": 2.1776960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.023181921215847407, "kl": 0.04035221040248871, "learning_rate": 2.1394166993891526e-07, "loss": 0.0004, "num_tokens": 56289882.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3846248388290405, "sampling/importance_sampling_ratio/mean": 0.9996110796928406, "sampling/importance_sampling_ratio/min": 0.6264980435371399, "sampling/sampling_logp_difference/max": 0.4676096439361572, "sampling/sampling_logp_difference/mean": 0.015457483008503914, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 234.4375, "completions/mean_terminated_length": 234.4375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3741098940372467, "epoch": 2.178921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.8540488270037477, "kl": 0.025974810123443604, "learning_rate": 2.1335768675117205e-07, "loss": 0.026, "num_tokens": 56324518.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5467331409454346, "sampling/importance_sampling_ratio/mean": 1.0000572204589844, "sampling/importance_sampling_ratio/min": 0.6638621687889099, "sampling/sampling_logp_difference/max": 0.4361450672149658, "sampling/sampling_logp_difference/mean": 0.01379794254899025, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 214.828125, "completions/mean_terminated_length": 214.828125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4446960687637329, "epoch": 2.1801470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 1.1561242958987044, "kl": 0.03877747058868408, "learning_rate": 2.1277428542916555e-07, "loss": 0.0157, "num_tokens": 56357611.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.46567964553833, "sampling/importance_sampling_ratio/mean": 1.00065016746521, "sampling/importance_sampling_ratio/min": 0.6390963792800903, "sampling/sampling_logp_difference/max": 0.44770002365112305, "sampling/sampling_logp_difference/mean": 0.016978032886981964, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 208.328125, "completions/mean_terminated_length": 208.328125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3657262325286865, "epoch": 2.1813725490196076, "frac_reward_zero_std": 0.75, "grad_norm": 0.6721604929803945, "kl": 0.03483795002102852, "learning_rate": 2.121914671571633e-07, "loss": 0.0192, "num_tokens": 56385216.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6058988571166992, "sampling/importance_sampling_ratio/mean": 1.0005840063095093, "sampling/importance_sampling_ratio/min": 0.747340977191925, "sampling/sampling_logp_difference/max": 0.47368359565734863, "sampling/sampling_logp_difference/mean": 0.014650316908955574, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 207.359375, "completions/mean_terminated_length": 207.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.36643749475479126, "epoch": 2.1825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.724791888276185, "kl": 0.03202063590288162, "learning_rate": 2.1160923311824934e-07, "loss": 0.0275, "num_tokens": 56416727.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.422170639038086, "sampling/importance_sampling_ratio/mean": 0.9998961091041565, "sampling/importance_sampling_ratio/min": 0.583415687084198, "sampling/sampling_logp_difference/max": 0.5388553142547607, "sampling/sampling_logp_difference/mean": 0.01394546777009964, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 170.984375, "completions/mean_terminated_length": 170.984375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4153890907764435, "epoch": 2.1838235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.8207494487650682, "kl": 0.06203257292509079, "learning_rate": 2.110275844943223e-07, "loss": 0.0188, "num_tokens": 56442854.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3838427066802979, "sampling/importance_sampling_ratio/mean": 0.9999013543128967, "sampling/importance_sampling_ratio/min": 0.6164276003837585, "sampling/sampling_logp_difference/max": 0.4838144779205322, "sampling/sampling_logp_difference/mean": 0.0162960272282362, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 153.390625, "completions/mean_terminated_length": 153.390625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2608410120010376, "epoch": 2.185049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.020464400904348198, "kl": 0.026761263608932495, "learning_rate": 2.1044652246609173e-07, "loss": 0.0003, "num_tokens": 56465183.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4588507413864136, "sampling/importance_sampling_ratio/mean": 1.0002644062042236, "sampling/importance_sampling_ratio/min": 0.6259872913360596, "sampling/sampling_logp_difference/max": 0.46842527389526367, "sampling/sampling_logp_difference/mean": 0.012925646267831326, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 211.484375, "completions/mean_terminated_length": 211.484375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.41743314266204834, "epoch": 2.186274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.7483264520808547, "kl": 0.06146371364593506, "learning_rate": 2.098660482130768e-07, "loss": -0.0007, "num_tokens": 56491886.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4583240747451782, "sampling/importance_sampling_ratio/mean": 0.9997591972351074, "sampling/importance_sampling_ratio/min": 0.6400062441825867, "sampling/sampling_logp_difference/max": 0.446277379989624, "sampling/sampling_logp_difference/mean": 0.015290914103388786, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 163.515625, "completions/mean_terminated_length": 163.515625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.43135976791381836, "epoch": 2.1875, "frac_reward_zero_std": 0.75, "grad_norm": 1.0479395928065414, "kl": 0.053393132984638214, "learning_rate": 2.092861629136033e-07, "loss": -0.0004, "num_tokens": 56518863.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5758846998214722, "sampling/importance_sampling_ratio/mean": 1.0003823041915894, "sampling/importance_sampling_ratio/min": 0.5807136297225952, "sampling/sampling_logp_difference/max": 0.5434975624084473, "sampling/sampling_logp_difference/mean": 0.01628931611776352, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.43314477801322937, "epoch": 2.188725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.039987186921131096, "kl": 0.04892154783010483, "learning_rate": 2.0870686774480196e-07, "loss": 0.0005, "num_tokens": 56548571.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6191405057907104, "sampling/importance_sampling_ratio/mean": 1.0004253387451172, "sampling/importance_sampling_ratio/min": 0.6203228235244751, "sampling/sampling_logp_difference/max": 0.48189544677734375, "sampling/sampling_logp_difference/mean": 0.01644906960427761, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 237.984375, "completions/mean_terminated_length": 237.984375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3843547999858856, "epoch": 2.189950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6781784825639222, "kl": 0.02506193518638611, "learning_rate": 2.0812816388260519e-07, "loss": 0.0216, "num_tokens": 56584634.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003893375396729, "sampling/importance_sampling_ratio/min": 0.642171323299408, "sampling/sampling_logp_difference/max": 0.709916353225708, "sampling/sampling_logp_difference/mean": 0.013689187355339527, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 240.453125, "completions/mean_terminated_length": 240.453125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.44558557868003845, "epoch": 2.1911764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.8566102775947064, "kl": 0.04730532318353653, "learning_rate": 2.0755005250174484e-07, "loss": 0.0261, "num_tokens": 56618487.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6304231882095337, "sampling/importance_sampling_ratio/mean": 1.0002212524414062, "sampling/importance_sampling_ratio/min": 0.6077031493186951, "sampling/sampling_logp_difference/max": 0.49806880950927734, "sampling/sampling_logp_difference/mean": 0.016564399003982544, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 199.84375, "completions/mean_terminated_length": 199.84375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4355708062648773, "epoch": 2.1924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.1387781585557262, "kl": 0.12204273045063019, "learning_rate": 2.0697253477575088e-07, "loss": 0.0301, "num_tokens": 56643789.0, "reward": 0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.9426008462905884, "sampling/importance_sampling_ratio/mean": 0.9996011853218079, "sampling/importance_sampling_ratio/min": 0.49107638001441956, "sampling/sampling_logp_difference/max": 0.7111556529998779, "sampling/sampling_logp_difference/mean": 0.016524486243724823, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 246.515625, "completions/mean_terminated_length": 246.515625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.42478808760643005, "epoch": 2.1936274509803924, "frac_reward_zero_std": 0.5, "grad_norm": 0.9592026126096747, "kl": 0.03127903491258621, "learning_rate": 2.0639561187694733e-07, "loss": -0.0021, "num_tokens": 56675310.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5297869443893433, "sampling/importance_sampling_ratio/mean": 0.9999357461929321, "sampling/importance_sampling_ratio/min": 0.6471536159515381, "sampling/sampling_logp_difference/max": 0.43517160415649414, "sampling/sampling_logp_difference/mean": 0.01433703675866127, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 220.140625, "completions/mean_terminated_length": 220.140625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.44690874218940735, "epoch": 2.1948529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.7668036451017813, "kl": 0.03841719776391983, "learning_rate": 2.0581928497645164e-07, "loss": -0.0074, "num_tokens": 56707623.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.658127784729004, "sampling/importance_sampling_ratio/mean": 1.000002384185791, "sampling/importance_sampling_ratio/min": 0.6332475543022156, "sampling/sampling_logp_difference/max": 0.5056891441345215, "sampling/sampling_logp_difference/mean": 0.01577058434486389, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 182.109375, "completions/mean_terminated_length": 182.109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.406405508518219, "epoch": 2.196078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.7276034273699343, "kl": 0.03541386127471924, "learning_rate": 2.0524355524417015e-07, "loss": 0.0113, "num_tokens": 56737166.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.658131718635559, "sampling/importance_sampling_ratio/mean": 0.9990932941436768, "sampling/importance_sampling_ratio/min": 0.637708842754364, "sampling/sampling_logp_difference/max": 0.5056915283203125, "sampling/sampling_logp_difference/mean": 0.01556492131203413, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 201.359375, "completions/mean_terminated_length": 201.359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4390181303024292, "epoch": 2.1973039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.021430027985534342, "kl": 0.03035741113126278, "learning_rate": 2.0466842384879829e-07, "loss": 0.0003, "num_tokens": 56767061.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4529781341552734, "sampling/importance_sampling_ratio/mean": 0.9994826316833496, "sampling/importance_sampling_ratio/min": 0.7436936497688293, "sampling/sampling_logp_difference/max": 0.3736152648925781, "sampling/sampling_logp_difference/mean": 0.015260099433362484, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 266.5625, "completions/mean_terminated_length": 266.5625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3280014395713806, "epoch": 2.198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.015950563319099935, "kl": 0.03242075443267822, "learning_rate": 2.0409389195781623e-07, "loss": 0.0003, "num_tokens": 56802265.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5215299129486084, "sampling/importance_sampling_ratio/mean": 1.0002336502075195, "sampling/importance_sampling_ratio/min": 0.6139700412750244, "sampling/sampling_logp_difference/max": 0.4878091812133789, "sampling/sampling_logp_difference/mean": 0.01232210174202919, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 186.78125, "completions/mean_terminated_length": 186.78125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.40459394454956055, "epoch": 2.1997549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01660475756175428, "kl": 0.027100378647446632, "learning_rate": 2.0351996073748713e-07, "loss": 0.0003, "num_tokens": 56835403.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4000202417373657, "sampling/importance_sampling_ratio/mean": 1.0003581047058105, "sampling/importance_sampling_ratio/min": 0.6682443022727966, "sampling/sampling_logp_difference/max": 0.40310144424438477, "sampling/sampling_logp_difference/mean": 0.016592102125287056, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 271.796875, "completions/mean_terminated_length": 271.796875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.386542409658432, "epoch": 2.200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.4588857037949476, "kl": 0.027792006731033325, "learning_rate": 2.0294663135285533e-07, "loss": 0.0541, "num_tokens": 56871982.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.480009913444519, "sampling/importance_sampling_ratio/mean": 1.000190019607544, "sampling/importance_sampling_ratio/min": 0.6394152045249939, "sampling/sampling_logp_difference/max": 0.44720131158828735, "sampling/sampling_logp_difference/mean": 0.013207310810685158, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 177.984375, "completions/mean_terminated_length": 177.984375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4077008366584778, "epoch": 2.202205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.0073105546962986, "kl": 0.061981189996004105, "learning_rate": 2.0237390496774282e-07, "loss": -0.0217, "num_tokens": 56897805.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3990243673324585, "sampling/importance_sampling_ratio/mean": 0.9989504814147949, "sampling/importance_sampling_ratio/min": 0.6263276934623718, "sampling/sampling_logp_difference/max": 0.4678816795349121, "sampling/sampling_logp_difference/mean": 0.016141217201948166, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 186.890625, "completions/mean_terminated_length": 186.890625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3266540765762329, "epoch": 2.2034313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.01935019162071562, "kl": 0.028891902416944504, "learning_rate": 2.0180178274474834e-07, "loss": 0.0003, "num_tokens": 56931046.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.600102186203003, "sampling/importance_sampling_ratio/mean": 0.9996902942657471, "sampling/importance_sampling_ratio/min": 0.6493139266967773, "sampling/sampling_logp_difference/max": 0.47006750106811523, "sampling/sampling_logp_difference/mean": 0.01283689122647047, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 207.203125, "completions/mean_terminated_length": 207.203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4144088625907898, "epoch": 2.204656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.020018461654744822, "kl": 0.02925487793982029, "learning_rate": 2.012302658452432e-07, "loss": 0.0003, "num_tokens": 56959715.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9894824028015137, "sampling/importance_sampling_ratio/mean": 1.000123143196106, "sampling/importance_sampling_ratio/min": 0.7063222527503967, "sampling/sampling_logp_difference/max": 0.6878745555877686, "sampling/sampling_logp_difference/mean": 0.01610928401350975, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.42290157079696655, "epoch": 2.2058823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01865298245981651, "kl": 0.031292933970689774, "learning_rate": 2.0065935542937073e-07, "loss": 0.0003, "num_tokens": 56987131.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4991945028305054, "sampling/importance_sampling_ratio/mean": 1.0002739429473877, "sampling/importance_sampling_ratio/min": 0.6922261118888855, "sampling/sampling_logp_difference/max": 0.40492796897888184, "sampling/sampling_logp_difference/mean": 0.017201673239469528, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 200.265625, "completions/mean_terminated_length": 200.265625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.38739973306655884, "epoch": 2.207107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.01689581843543527, "kl": 0.024199094623327255, "learning_rate": 2.0008905265604315e-07, "loss": 0.0002, "num_tokens": 57019180.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5504204034805298, "sampling/importance_sampling_ratio/mean": 0.9998184442520142, "sampling/importance_sampling_ratio/min": 0.6028806567192078, "sampling/sampling_logp_difference/max": 0.5060360431671143, "sampling/sampling_logp_difference/mean": 0.015314217656850815, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 177.765625, "completions/mean_terminated_length": 177.765625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3768065869808197, "epoch": 2.2083333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.02137046244544593, "kl": 0.03218837454915047, "learning_rate": 1.995193586829387e-07, "loss": 0.0003, "num_tokens": 57044493.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4296340942382812, "sampling/importance_sampling_ratio/mean": 0.9995778799057007, "sampling/importance_sampling_ratio/min": 0.6254597306251526, "sampling/sampling_logp_difference/max": 0.4692683219909668, "sampling/sampling_logp_difference/mean": 0.015134826302528381, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 215.78125, "completions/mean_terminated_length": 215.78125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.41760993003845215, "epoch": 2.2095588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.8668926560172997, "kl": 0.027726545929908752, "learning_rate": 1.989502746665001e-07, "loss": -0.0114, "num_tokens": 57073247.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5832754373550415, "sampling/importance_sampling_ratio/mean": 0.999932587146759, "sampling/importance_sampling_ratio/min": 0.6758054494857788, "sampling/sampling_logp_difference/max": 0.45949578285217285, "sampling/sampling_logp_difference/mean": 0.014503438025712967, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 168.203125, "completions/mean_terminated_length": 168.203125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.33378592133522034, "epoch": 2.2107843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.017364213449427, "kl": 0.02794494479894638, "learning_rate": 1.9838180176193176e-07, "loss": 0.0003, "num_tokens": 57111532.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8474539518356323, "sampling/importance_sampling_ratio/mean": 1.0004229545593262, "sampling/importance_sampling_ratio/min": 0.6781023144721985, "sampling/sampling_logp_difference/max": 0.6138083934783936, "sampling/sampling_logp_difference/mean": 0.013987618498504162, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 166.203125, "completions/mean_terminated_length": 166.203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3480472266674042, "epoch": 2.2120098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.017573714854610165, "kl": 0.02866465598344803, "learning_rate": 1.9781394112319787e-07, "loss": 0.0003, "num_tokens": 57136569.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6249321699142456, "sampling/importance_sampling_ratio/mean": 1.0000239610671997, "sampling/importance_sampling_ratio/min": 0.6299121379852295, "sampling/sampling_logp_difference/max": 0.48546600341796875, "sampling/sampling_logp_difference/mean": 0.01445926446467638, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 232.640625, "completions/mean_terminated_length": 232.640625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.46496593952178955, "epoch": 2.213235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.6456536278620773, "kl": 0.03583163022994995, "learning_rate": 1.9724669390301946e-07, "loss": -0.0097, "num_tokens": 57173954.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4635348320007324, "sampling/importance_sampling_ratio/mean": 1.0003316402435303, "sampling/importance_sampling_ratio/min": 0.6590652465820312, "sampling/sampling_logp_difference/max": 0.41693270206451416, "sampling/sampling_logp_difference/mean": 0.015367222018539906, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 187.4375, "completions/mean_terminated_length": 187.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.36628732085227966, "epoch": 2.2144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.8709914404044724, "kl": 0.038709282875061035, "learning_rate": 1.9668006125287228e-07, "loss": -0.0025, "num_tokens": 57200974.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5865304470062256, "sampling/importance_sampling_ratio/mean": 1.0000580549240112, "sampling/importance_sampling_ratio/min": 0.6299977898597717, "sampling/sampling_logp_difference/max": 0.4620389938354492, "sampling/sampling_logp_difference/mean": 0.015021628700196743, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 221.609375, "completions/mean_terminated_length": 221.609375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.35634899139404297, "epoch": 2.215686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.016042609747101415, "kl": 0.028308294713497162, "learning_rate": 1.96114044322985e-07, "loss": 0.0003, "num_tokens": 57229925.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6276687383651733, "sampling/importance_sampling_ratio/mean": 0.9993330240249634, "sampling/importance_sampling_ratio/min": 0.6147381663322449, "sampling/sampling_logp_difference/max": 0.4871487617492676, "sampling/sampling_logp_difference/mean": 0.01397157832980156, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 180.34375, "completions/mean_terminated_length": 180.34375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.35287073254585266, "epoch": 2.2169117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.022584617604283003, "kl": 0.030864350497722626, "learning_rate": 1.9554864426233604e-07, "loss": 0.0003, "num_tokens": 57255003.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7157540321350098, "sampling/importance_sampling_ratio/mean": 0.9992146492004395, "sampling/importance_sampling_ratio/min": 0.32864677906036377, "sampling/sampling_logp_difference/max": 1.11277174949646, "sampling/sampling_logp_difference/mean": 0.015081110410392284, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 130.265625, "completions/mean_terminated_length": 130.265625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.345086932182312, "epoch": 2.218137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.122770356762049, "kl": 0.06593969464302063, "learning_rate": 1.9498386221865165e-07, "loss": 0.0098, "num_tokens": 57276764.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3267107009887695, "sampling/importance_sampling_ratio/mean": 1.0004007816314697, "sampling/importance_sampling_ratio/min": 0.7094165086746216, "sampling/sampling_logp_difference/max": 0.34331250190734863, "sampling/sampling_logp_difference/mean": 0.015385487116873264, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 166.328125, "completions/mean_terminated_length": 166.328125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3845037519931793, "epoch": 2.219362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.027420821102334385, "kl": 0.05962613224983215, "learning_rate": 1.944196993384034e-07, "loss": 0.0006, "num_tokens": 57310689.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002989768981934, "sampling/importance_sampling_ratio/min": 0.548373281955719, "sampling/sampling_logp_difference/max": 0.7172539234161377, "sampling/sampling_logp_difference/mean": 0.016318058595061302, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 183.90625, "completions/mean_terminated_length": 183.90625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3913060128688812, "epoch": 2.2205882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.0367113657543623, "kl": 0.02648971602320671, "learning_rate": 1.9385615676680661e-07, "loss": 0.0149, "num_tokens": 57339099.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4023281335830688, "sampling/importance_sampling_ratio/mean": 1.0001801252365112, "sampling/importance_sampling_ratio/min": 0.5448763370513916, "sampling/sampling_logp_difference/max": 0.6071963310241699, "sampling/sampling_logp_difference/mean": 0.015552809461951256, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3784450888633728, "epoch": 2.221813725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.0020648254354139, "kl": 0.06395342946052551, "learning_rate": 1.932932356478168e-07, "loss": 0.0085, "num_tokens": 57368041.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.6509145498275757, "sampling/importance_sampling_ratio/mean": 1.0001643896102905, "sampling/importance_sampling_ratio/min": 0.7201745510101318, "sampling/sampling_logp_difference/max": 0.5013294219970703, "sampling/sampling_logp_difference/mean": 0.013813063502311707, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 204.03125, "completions/mean_terminated_length": 204.03125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.42454639077186584, "epoch": 2.2230392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.0508410085374955, "kl": 0.03902792930603027, "learning_rate": 1.9273093712412796e-07, "loss": 0.0388, "num_tokens": 57398795.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.618973731994629, "sampling/importance_sampling_ratio/mean": 0.9997764825820923, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4817924499511719, "sampling/sampling_logp_difference/mean": 0.017444204539060593, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 178.390625, "completions/mean_terminated_length": 178.390625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.35003480315208435, "epoch": 2.224264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.02101046059494195, "kl": 0.02411719411611557, "learning_rate": 1.9216926233717084e-07, "loss": 0.0002, "num_tokens": 57429876.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9010097980499268, "sampling/importance_sampling_ratio/mean": 0.9996275901794434, "sampling/importance_sampling_ratio/min": 0.6467365026473999, "sampling/sampling_logp_difference/max": 0.6423852443695068, "sampling/sampling_logp_difference/mean": 0.015089732594788074, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 194.859375, "completions/mean_terminated_length": 194.859375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.37340590357780457, "epoch": 2.2254901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.999134269824642, "kl": 0.046641554683446884, "learning_rate": 1.9160821242710957e-07, "loss": -0.0064, "num_tokens": 57458443.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.603572964668274, "sampling/importance_sampling_ratio/mean": 1.000485897064209, "sampling/importance_sampling_ratio/min": 0.6118009686470032, "sampling/sampling_logp_difference/max": 0.4913482666015625, "sampling/sampling_logp_difference/mean": 0.014087516814470291, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 231.40625, "completions/mean_terminated_length": 231.40625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.33101558685302734, "epoch": 2.2267156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.019865124744391862, "kl": 0.03153214231133461, "learning_rate": 1.9104778853283987e-07, "loss": 0.0003, "num_tokens": 57490725.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6230480670928955, "sampling/importance_sampling_ratio/mean": 1.0005909204483032, "sampling/importance_sampling_ratio/min": 0.6489548087120056, "sampling/sampling_logp_difference/max": 0.48430585861206055, "sampling/sampling_logp_difference/mean": 0.013317778706550598, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 206.546875, "completions/mean_terminated_length": 206.546875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4915628433227539, "epoch": 2.2279411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.1199918148309598, "kl": 0.08405325561761856, "learning_rate": 1.9048799179198655e-07, "loss": 0.0123, "num_tokens": 57517768.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.5523357391357422, "sampling/importance_sampling_ratio/mean": 1.000276803970337, "sampling/importance_sampling_ratio/min": 0.6532287001609802, "sampling/sampling_logp_difference/max": 0.439760684967041, "sampling/sampling_logp_difference/mean": 0.017245426774024963, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 173.9375, "completions/mean_terminated_length": 173.9375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2633485794067383, "epoch": 2.2291666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.023926006425631696, "kl": 0.031033962965011597, "learning_rate": 1.8992882334090188e-07, "loss": 0.0003, "num_tokens": 57543588.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004593133926392, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.8591389656066895, "sampling/sampling_logp_difference/mean": 0.012853178195655346, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 222.265625, "completions/mean_terminated_length": 222.265625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3312186300754547, "epoch": 2.230392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7378282345857047, "kl": 0.020720241591334343, "learning_rate": 1.893702843146623e-07, "loss": 0.0373, "num_tokens": 57578037.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0006163120269775, "sampling/importance_sampling_ratio/min": 0.379047691822052, "sampling/sampling_logp_difference/max": 0.9700932502746582, "sampling/sampling_logp_difference/mean": 0.012796593829989433, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 194.859375, "completions/mean_terminated_length": 194.859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3865714967250824, "epoch": 2.2316176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.019562472786731268, "kl": 0.031382933259010315, "learning_rate": 1.8881237584706632e-07, "loss": 0.0003, "num_tokens": 57606924.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3419557809829712, "sampling/importance_sampling_ratio/mean": 0.9994385838508606, "sampling/importance_sampling_ratio/min": 0.6411448121070862, "sampling/sampling_logp_difference/max": 0.4444999694824219, "sampling/sampling_logp_difference/mean": 0.01487137284129858, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 193.671875, "completions/mean_terminated_length": 193.671875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.38141852617263794, "epoch": 2.232843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.02168403105048124, "kl": 0.025620419532060623, "learning_rate": 1.8825509907063326e-07, "loss": 0.0002, "num_tokens": 57634967.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4173139333724976, "sampling/importance_sampling_ratio/mean": 1.000117301940918, "sampling/importance_sampling_ratio/min": 0.6606555581092834, "sampling/sampling_logp_difference/max": 0.414522647857666, "sampling/sampling_logp_difference/mean": 0.014868944883346558, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 221.90625, "completions/mean_terminated_length": 221.90625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.379791796207428, "epoch": 2.2340686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.9694196703380831, "kl": 0.037543851882219315, "learning_rate": 1.8769845511659927e-07, "loss": -0.0104, "num_tokens": 57667425.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4553523063659668, "sampling/importance_sampling_ratio/mean": 1.0001063346862793, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4679851531982422, "sampling/sampling_logp_difference/mean": 0.014764810912311077, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 177.5625, "completions/mean_terminated_length": 177.5625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.36093395948410034, "epoch": 2.235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.018977386129609426, "kl": 0.027511999011039734, "learning_rate": 1.871424451149169e-07, "loss": 0.0003, "num_tokens": 57696581.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4059081077575684, "sampling/importance_sampling_ratio/mean": 0.999370276927948, "sampling/importance_sampling_ratio/min": 0.7365430593490601, "sampling/sampling_logp_difference/max": 0.3406834602355957, "sampling/sampling_logp_difference/mean": 0.013832824304699898, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 255.015625, "completions/mean_terminated_length": 255.015625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.43517765402793884, "epoch": 2.236519607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.9632358977010006, "kl": 0.06032560020685196, "learning_rate": 1.865870701942504e-07, "loss": -0.0186, "num_tokens": 57735958.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000049114227295, "sampling/importance_sampling_ratio/min": 0.6070228219032288, "sampling/sampling_logp_difference/max": 0.9196538925170898, "sampling/sampling_logp_difference/mean": 0.0153459832072258, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.370273232460022, "epoch": 2.2377450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7517804749863046, "kl": 0.04560757055878639, "learning_rate": 1.8603233148197632e-07, "loss": 0.0059, "num_tokens": 57770118.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999024868011475, "sampling/importance_sampling_ratio/min": 0.6707372069358826, "sampling/sampling_logp_difference/max": 0.8612053394317627, "sampling/sampling_logp_difference/mean": 0.013436258770525455, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 175.234375, "completions/mean_terminated_length": 175.234375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.37318986654281616, "epoch": 2.238970588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9467269096497812, "kl": 0.04705360531806946, "learning_rate": 1.8547823010417873e-07, "loss": 0.0136, "num_tokens": 57795829.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3680918216705322, "sampling/importance_sampling_ratio/mean": 1.0007050037384033, "sampling/importance_sampling_ratio/min": 0.6918064951896667, "sampling/sampling_logp_difference/max": 0.36844897270202637, "sampling/sampling_logp_difference/mean": 0.014947384595870972, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 203.296875, "completions/mean_terminated_length": 203.296875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4300217032432556, "epoch": 2.2401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.032675433367653266, "kl": 0.05343396216630936, "learning_rate": 1.8492476718564866e-07, "loss": 0.0005, "num_tokens": 57827912.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.546739101409912, "sampling/importance_sampling_ratio/mean": 1.0007200241088867, "sampling/importance_sampling_ratio/min": 0.6262629628181458, "sampling/sampling_logp_difference/max": 0.4679849147796631, "sampling/sampling_logp_difference/mean": 0.015370909124612808, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 228.3125, "completions/mean_terminated_length": 228.3125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.40327417850494385, "epoch": 2.241421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.732180587909517, "kl": 0.043199148029088974, "learning_rate": 1.8437194384988058e-07, "loss": 0.0297, "num_tokens": 57860476.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4032870531082153, "sampling/importance_sampling_ratio/mean": 0.9995312094688416, "sampling/importance_sampling_ratio/min": 0.6483083367347717, "sampling/sampling_logp_difference/max": 0.4333888292312622, "sampling/sampling_logp_difference/mean": 0.014346604235470295, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 151.515625, "completions/mean_terminated_length": 151.515625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3040603995323181, "epoch": 2.2426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.028195815879549224, "kl": 0.03291485831141472, "learning_rate": 1.8381976121907067e-07, "loss": 0.0003, "num_tokens": 57885885.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4264153242111206, "sampling/importance_sampling_ratio/mean": 0.9997786283493042, "sampling/importance_sampling_ratio/min": 0.6171460747718811, "sampling/sampling_logp_difference/max": 0.482649564743042, "sampling/sampling_logp_difference/mean": 0.01339528989046812, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 205.921875, "completions/mean_terminated_length": 205.921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3580760657787323, "epoch": 2.2438725490196076, "frac_reward_zero_std": 1.0, "grad_norm": 0.01819422238485712, "kl": 0.02745627611875534, "learning_rate": 1.832682204141152e-07, "loss": 0.0003, "num_tokens": 57917576.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.507186770439148, "sampling/importance_sampling_ratio/mean": 1.0001232624053955, "sampling/importance_sampling_ratio/min": 0.6610642671585083, "sampling/sampling_logp_difference/max": 0.41390419006347656, "sampling/sampling_logp_difference/mean": 0.013558096252381802, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3429805040359497, "epoch": 2.2450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.770708486626644, "kl": 0.05356878042221069, "learning_rate": 1.8271732255460643e-07, "loss": 0.0009, "num_tokens": 57946440.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6120491027832031, "sampling/importance_sampling_ratio/mean": 1.0000370740890503, "sampling/importance_sampling_ratio/min": 0.6269176006317139, "sampling/sampling_logp_difference/max": 0.477506160736084, "sampling/sampling_logp_difference/mean": 0.013620387762784958, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 210.484375, "completions/mean_terminated_length": 210.484375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.40086644887924194, "epoch": 2.2463235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.023943640421089363, "kl": 0.026479825377464294, "learning_rate": 1.8216706875883252e-07, "loss": 0.0003, "num_tokens": 57976967.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.421567440032959, "sampling/importance_sampling_ratio/mean": 1.0002470016479492, "sampling/importance_sampling_ratio/min": 0.7061286568641663, "sampling/sampling_logp_difference/max": 0.3517601490020752, "sampling/sampling_logp_difference/mean": 0.014774276874959469, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 196.28125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.34230923652648926, "epoch": 2.247549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.014822602203528625, "kl": 0.02011169120669365, "learning_rate": 1.816174601437736e-07, "loss": 0.0002, "num_tokens": 58008969.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.449621319770813, "sampling/importance_sampling_ratio/mean": 1.000059723854065, "sampling/importance_sampling_ratio/min": 0.582772433757782, "sampling/sampling_logp_difference/max": 0.5399584770202637, "sampling/sampling_logp_difference/mean": 0.014088565483689308, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 234.609375, "completions/mean_terminated_length": 234.609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.5302902460098267, "epoch": 2.248774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.1187976558045092, "kl": 0.0506209135055542, "learning_rate": 1.8106849782510058e-07, "loss": 0.0394, "num_tokens": 58041856.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997063875198364, "sampling/importance_sampling_ratio/min": 0.48501065373420715, "sampling/sampling_logp_difference/max": 0.9161038398742676, "sampling/sampling_logp_difference/mean": 0.01879747584462166, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 200.515625, "completions/mean_terminated_length": 200.515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.4087119996547699, "epoch": 2.25, "frac_reward_zero_std": 1.0, "grad_norm": 0.04229656470672519, "kl": 0.03463684767484665, "learning_rate": 1.8052018291717215e-07, "loss": 0.0003, "num_tokens": 58078033.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005056858062744, "sampling/importance_sampling_ratio/min": 0.5393726825714111, "sampling/sampling_logp_difference/max": 0.754511833190918, "sampling/sampling_logp_difference/mean": 0.016497399657964706, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 179.328125, "completions/mean_terminated_length": 179.328125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3928197920322418, "epoch": 2.251225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.9492877853035604, "kl": 0.03979340195655823, "learning_rate": 1.7997251653303247e-07, "loss": -0.0348, "num_tokens": 58110102.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.559341549873352, "sampling/importance_sampling_ratio/mean": 1.0000081062316895, "sampling/importance_sampling_ratio/min": 0.6039516925811768, "sampling/sampling_logp_difference/max": 0.5042610168457031, "sampling/sampling_logp_difference/mean": 0.015556180849671364, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 183.671875, "completions/mean_terminated_length": 183.671875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.35463041067123413, "epoch": 2.252450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8284520374198466, "kl": 0.04506409540772438, "learning_rate": 1.7942549978441012e-07, "loss": 0.0121, "num_tokens": 58140977.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5665878057479858, "sampling/importance_sampling_ratio/mean": 1.0000871419906616, "sampling/importance_sampling_ratio/min": 0.6382682919502258, "sampling/sampling_logp_difference/max": 0.44899654388427734, "sampling/sampling_logp_difference/mean": 0.014326345175504684, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 170.34375, "completions/mean_terminated_length": 170.34375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.35513296723365784, "epoch": 2.2536764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.02140885507326921, "kl": 0.0316675566136837, "learning_rate": 1.7887913378171422e-07, "loss": 0.0003, "num_tokens": 58166695.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5436952114105225, "sampling/importance_sampling_ratio/mean": 1.0004980564117432, "sampling/importance_sampling_ratio/min": 0.6534729599952698, "sampling/sampling_logp_difference/max": 0.43417906761169434, "sampling/sampling_logp_difference/mean": 0.015069067478179932, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 219.03125, "completions/mean_terminated_length": 219.03125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.31455889344215393, "epoch": 2.2549019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.016787786466763342, "kl": 0.02507929503917694, "learning_rate": 1.783334196340331e-07, "loss": 0.0003, "num_tokens": 58199289.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5739880800247192, "sampling/importance_sampling_ratio/mean": 1.0001616477966309, "sampling/importance_sampling_ratio/min": 0.6265933513641357, "sampling/sampling_logp_difference/max": 0.46745753288269043, "sampling/sampling_logp_difference/mean": 0.012406616471707821, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 169.953125, "completions/mean_terminated_length": 169.953125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2821255326271057, "epoch": 2.256127450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.017764456952031223, "kl": 0.028326895087957382, "learning_rate": 1.777883584491317e-07, "loss": 0.0003, "num_tokens": 58224102.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.586036205291748, "sampling/importance_sampling_ratio/mean": 0.999875545501709, "sampling/importance_sampling_ratio/min": 0.6367641091346741, "sampling/sampling_logp_difference/max": 0.46123790740966797, "sampling/sampling_logp_difference/mean": 0.012191805988550186, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 174.234375, "completions/mean_terminated_length": 174.234375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.34605711698532104, "epoch": 2.2573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.019113516655231564, "kl": 0.030208708718419075, "learning_rate": 1.7724395133345022e-07, "loss": 0.0003, "num_tokens": 58255989.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6088987588882446, "sampling/importance_sampling_ratio/mean": 1.0002429485321045, "sampling/importance_sampling_ratio/min": 0.6771940588951111, "sampling/sampling_logp_difference/max": 0.47554993629455566, "sampling/sampling_logp_difference/mean": 0.013957983814179897, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 151.796875, "completions/mean_terminated_length": 151.796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3741613030433655, "epoch": 2.258578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0317385444015215, "kl": 0.0608445405960083, "learning_rate": 1.7670019939210023e-07, "loss": 0.0006, "num_tokens": 58281800.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5932728052139282, "sampling/importance_sampling_ratio/mean": 1.0003043413162231, "sampling/importance_sampling_ratio/min": 0.6771954894065857, "sampling/sampling_logp_difference/max": 0.4657902717590332, "sampling/sampling_logp_difference/mean": 0.015265490859746933, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.37429383397102356, "epoch": 2.2598039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.897473156153675, "kl": 0.04108111187815666, "learning_rate": 1.761571037288637e-07, "loss": -0.0059, "num_tokens": 58306072.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.2885618209838867, "sampling/importance_sampling_ratio/mean": 1.0001215934753418, "sampling/importance_sampling_ratio/min": 0.6546775102615356, "sampling/sampling_logp_difference/max": 0.42361247539520264, "sampling/sampling_logp_difference/mean": 0.01529126986861229, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 199.453125, "completions/mean_terminated_length": 199.453125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.4133285880088806, "epoch": 2.261029411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.8305264882597939, "kl": 0.046958938241004944, "learning_rate": 1.7561466544619076e-07, "loss": 0.0054, "num_tokens": 58342485.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4241420030593872, "sampling/importance_sampling_ratio/mean": 0.9996469020843506, "sampling/importance_sampling_ratio/min": 0.698478102684021, "sampling/sampling_logp_difference/max": 0.35885143280029297, "sampling/sampling_logp_difference/mean": 0.015273008495569229, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 148.453125, "completions/mean_terminated_length": 148.453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.30733948945999146, "epoch": 2.2622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.025645417825013117, "kl": 0.029887264594435692, "learning_rate": 1.7507288564519646e-07, "loss": 0.0003, "num_tokens": 58366018.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5554358959197998, "sampling/importance_sampling_ratio/mean": 0.9999924302101135, "sampling/importance_sampling_ratio/min": 0.6394727230072021, "sampling/sampling_logp_difference/max": 0.4471113681793213, "sampling/sampling_logp_difference/mean": 0.014675739221274853, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 247.65625, "completions/mean_terminated_length": 247.65625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.46249935030937195, "epoch": 2.263480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.029007902574953028, "kl": 0.04160034656524658, "learning_rate": 1.7453176542565956e-07, "loss": 0.0004, "num_tokens": 58406588.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9995511770248413, "sampling/importance_sampling_ratio/min": 0.6306003332138062, "sampling/sampling_logp_difference/max": 0.46108293533325195, "sampling/sampling_logp_difference/mean": 0.015138974413275719, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 208.15625, "completions/mean_terminated_length": 208.15625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4643312692642212, "epoch": 2.264705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.023178761766796327, "kl": 0.033480897545814514, "learning_rate": 1.7399130588601968e-07, "loss": 0.0003, "num_tokens": 58445270.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.7730165719985962, "sampling/importance_sampling_ratio/mean": 1.0004382133483887, "sampling/importance_sampling_ratio/min": 0.62339186668396, "sampling/sampling_logp_difference/max": 0.5726823806762695, "sampling/sampling_logp_difference/mean": 0.01806674152612686, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 204.0625, "completions/mean_terminated_length": 204.0625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3384186625480652, "epoch": 2.2659313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.015163834042729219, "kl": 0.02468809485435486, "learning_rate": 1.7345150812337562e-07, "loss": 0.0002, "num_tokens": 58476682.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4060077667236328, "sampling/importance_sampling_ratio/mean": 0.9995001554489136, "sampling/importance_sampling_ratio/min": 0.6893008947372437, "sampling/sampling_logp_difference/max": 0.37207746505737305, "sampling/sampling_logp_difference/mean": 0.013668473809957504, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 236.703125, "completions/mean_terminated_length": 236.703125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.36269164085388184, "epoch": 2.267156862745098, "frac_reward_zero_std": 0.5, "grad_norm": 1.0564679752562487, "kl": 0.03552190959453583, "learning_rate": 1.7291237323348284e-07, "loss": 0.053, "num_tokens": 58507943.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5277749300003052, "sampling/importance_sampling_ratio/mean": 0.9997976422309875, "sampling/importance_sampling_ratio/min": 0.642673909664154, "sampling/sampling_logp_difference/max": 0.4421178102493286, "sampling/sampling_logp_difference/mean": 0.013543002307415009, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 191.171875, "completions/mean_terminated_length": 191.171875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.4077395796775818, "epoch": 2.2683823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.017419935750676126, "kl": 0.028113462030887604, "learning_rate": 1.7237390231075055e-07, "loss": 0.0003, "num_tokens": 58543410.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.610731840133667, "sampling/importance_sampling_ratio/mean": 1.000548005104065, "sampling/importance_sampling_ratio/min": 0.7022053003311157, "sampling/sampling_logp_difference/max": 0.4766886234283447, "sampling/sampling_logp_difference/mean": 0.015693701803684235, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 190.40625, "completions/mean_terminated_length": 190.40625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.38933029770851135, "epoch": 2.269607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.050897849937472595, "kl": 0.04273064807057381, "learning_rate": 1.7183609644824092e-07, "loss": 0.0004, "num_tokens": 58574476.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4586049318313599, "sampling/importance_sampling_ratio/mean": 0.9996222853660583, "sampling/importance_sampling_ratio/min": 0.6622437834739685, "sampling/sampling_logp_difference/max": 0.4121215343475342, "sampling/sampling_logp_difference/mean": 0.01635589264333248, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 207.515625, "completions/mean_terminated_length": 207.515625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.40968212485313416, "epoch": 2.2708333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.7912864685020553, "kl": 0.041417062282562256, "learning_rate": 1.7129895673766575e-07, "loss": 0.0109, "num_tokens": 58604621.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.592902421951294, "sampling/importance_sampling_ratio/mean": 0.999763548374176, "sampling/importance_sampling_ratio/min": 0.635581374168396, "sampling/sampling_logp_difference/max": 0.4655578136444092, "sampling/sampling_logp_difference/mean": 0.01589188538491726, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 197.546875, "completions/mean_terminated_length": 197.546875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3571816384792328, "epoch": 2.2720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01711274830675318, "kl": 0.023957177996635437, "learning_rate": 1.707624842693844e-07, "loss": 0.0002, "num_tokens": 58643168.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5298007726669312, "sampling/importance_sampling_ratio/mean": 0.9997508525848389, "sampling/importance_sampling_ratio/min": 0.6468111872673035, "sampling/sampling_logp_difference/max": 0.4357008934020996, "sampling/sampling_logp_difference/mean": 0.014274682849645615, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 173.765625, "completions/mean_terminated_length": 173.765625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.34523043036460876, "epoch": 2.2732843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.020056058550808338, "kl": 0.03291430324316025, "learning_rate": 1.7022668013240227e-07, "loss": 0.0003, "num_tokens": 58671761.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2966854572296143, "sampling/importance_sampling_ratio/mean": 0.9996277093887329, "sampling/importance_sampling_ratio/min": 0.6234764456748962, "sampling/sampling_logp_difference/max": 0.4724442958831787, "sampling/sampling_logp_difference/mean": 0.013802153058350086, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 175.09375, "completions/mean_terminated_length": 175.09375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.460713267326355, "epoch": 2.2745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.025380685321179753, "kl": 0.04328715801239014, "learning_rate": 1.696915454143676e-07, "loss": 0.0005, "num_tokens": 58699495.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6357195377349854, "sampling/importance_sampling_ratio/mean": 1.00018310546875, "sampling/importance_sampling_ratio/min": 0.6576760411262512, "sampling/sampling_logp_difference/max": 0.4920828342437744, "sampling/sampling_logp_difference/mean": 0.017435822635889053, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 207.796875, "completions/mean_terminated_length": 207.796875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.45708906650543213, "epoch": 2.275735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.8878465244198234, "kl": 0.04913056641817093, "learning_rate": 1.691570812015704e-07, "loss": 0.0004, "num_tokens": 58731610.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6162848472595215, "sampling/importance_sampling_ratio/mean": 0.9995481371879578, "sampling/importance_sampling_ratio/min": 0.7015320658683777, "sampling/sampling_logp_difference/max": 0.4801301956176758, "sampling/sampling_logp_difference/mean": 0.016341743990778923, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 188.484375, "completions/mean_terminated_length": 188.484375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3209288716316223, "epoch": 2.2769607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.017785051769217963, "kl": 0.021816005930304527, "learning_rate": 1.6862328857893855e-07, "loss": 0.0002, "num_tokens": 58760393.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6555925607681274, "sampling/importance_sampling_ratio/mean": 1.0000321865081787, "sampling/importance_sampling_ratio/min": 0.6114137172698975, "sampling/sampling_logp_difference/max": 0.5041589736938477, "sampling/sampling_logp_difference/mean": 0.014840750023722649, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 201.890625, "completions/mean_terminated_length": 201.890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.5158641934394836, "epoch": 2.278186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.9370403207892111, "kl": 0.06312260031700134, "learning_rate": 1.680901686300376e-07, "loss": 0.0314, "num_tokens": 58795218.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4305166006088257, "sampling/importance_sampling_ratio/mean": 1.0003159046173096, "sampling/importance_sampling_ratio/min": 0.6245165467262268, "sampling/sampling_logp_difference/max": 0.4707775115966797, "sampling/sampling_logp_difference/mean": 0.01818527653813362, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 202.546875, "completions/mean_terminated_length": 202.546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4285881519317627, "epoch": 2.2794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.025241984979734046, "kl": 0.034309498965740204, "learning_rate": 1.6755772243706712e-07, "loss": 0.0003, "num_tokens": 58826037.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6094081401824951, "sampling/importance_sampling_ratio/mean": 0.9999540448188782, "sampling/importance_sampling_ratio/min": 0.6262832880020142, "sampling/sampling_logp_difference/max": 0.47586655616760254, "sampling/sampling_logp_difference/mean": 0.01637199893593788, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 207.8125, "completions/mean_terminated_length": 207.8125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.44034770131111145, "epoch": 2.280637254901961, "frac_reward_zero_std": 0.5, "grad_norm": 0.9075536893096079, "kl": 0.060066547244787216, "learning_rate": 1.6702595108085942e-07, "loss": 0.018, "num_tokens": 58860473.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3368315696716309, "sampling/importance_sampling_ratio/mean": 0.999178409576416, "sampling/importance_sampling_ratio/min": 0.6380237936973572, "sampling/sampling_logp_difference/max": 0.44937968254089355, "sampling/sampling_logp_difference/mean": 0.015396175906062126, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 192.296875, "completions/mean_terminated_length": 192.296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.4821186363697052, "epoch": 2.281862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.7283525685923916, "kl": 0.05311132222414017, "learning_rate": 1.6649485564087644e-07, "loss": -0.0138, "num_tokens": 58893596.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5695596933364868, "sampling/importance_sampling_ratio/mean": 1.000570297241211, "sampling/importance_sampling_ratio/min": 0.7008822560310364, "sampling/sampling_logp_difference/max": 0.45079517364501953, "sampling/sampling_logp_difference/mean": 0.01771252416074276, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 157.078125, "completions/mean_terminated_length": 157.078125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.36174434423446655, "epoch": 2.2830882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.026088477295273103, "kl": 0.034807972609996796, "learning_rate": 1.6596443719520826e-07, "loss": 0.0003, "num_tokens": 58919969.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3641308546066284, "sampling/importance_sampling_ratio/mean": 1.0005955696105957, "sampling/importance_sampling_ratio/min": 0.6953715085983276, "sampling/sampling_logp_difference/max": 0.36330902576446533, "sampling/sampling_logp_difference/mean": 0.014958731830120087, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 202.734375, "completions/mean_terminated_length": 202.734375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3093493580818176, "epoch": 2.284313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.016420602162738714, "kl": 0.02179465815424919, "learning_rate": 1.6543469682057104e-07, "loss": 0.0002, "num_tokens": 58949712.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.597166657447815, "sampling/importance_sampling_ratio/mean": 1.000213861465454, "sampling/importance_sampling_ratio/min": 0.6173874139785767, "sampling/sampling_logp_difference/max": 0.48225855827331543, "sampling/sampling_logp_difference/mean": 0.013449499383568764, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 182.671875, "completions/mean_terminated_length": 182.671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.34986114501953125, "epoch": 2.2855392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.018810455421061027, "kl": 0.02657250314950943, "learning_rate": 1.6490563559230357e-07, "loss": 0.0003, "num_tokens": 58976267.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3779224157333374, "sampling/importance_sampling_ratio/mean": 0.9998255372047424, "sampling/importance_sampling_ratio/min": 0.6256580948829651, "sampling/sampling_logp_difference/max": 0.4689512252807617, "sampling/sampling_logp_difference/mean": 0.014288580045104027, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5183290243148804, "epoch": 2.286764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.8244515277802735, "kl": 0.045404620468616486, "learning_rate": 1.6437725458436725e-07, "loss": -0.0018, "num_tokens": 59007979.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.540433406829834, "sampling/importance_sampling_ratio/mean": 1.0007423162460327, "sampling/importance_sampling_ratio/min": 0.6332263946533203, "sampling/sampling_logp_difference/max": 0.4569272994995117, "sampling/sampling_logp_difference/mean": 0.01804598607122898, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.28167539834976196, "epoch": 2.2879901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.9389971339151029, "kl": 0.03808104991912842, "learning_rate": 1.6384955486934154e-07, "loss": -0.0166, "num_tokens": 59034019.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6505603790283203, "sampling/importance_sampling_ratio/mean": 1.0002169609069824, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.5011148452758789, "sampling/sampling_logp_difference/mean": 0.012344785034656525, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 201.3125, "completions/mean_terminated_length": 201.3125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3862236738204956, "epoch": 2.2892156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.02561505746737744, "kl": 0.028078967705368996, "learning_rate": 1.633225375184239e-07, "loss": 0.0003, "num_tokens": 59064039.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.475394368171692, "sampling/importance_sampling_ratio/mean": 1.000014066696167, "sampling/importance_sampling_ratio/min": 0.6452736258506775, "sampling/sampling_logp_difference/max": 0.43808090686798096, "sampling/sampling_logp_difference/mean": 0.015584070235490799, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.40446603298187256, "epoch": 2.2904411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.9237385409935693, "kl": 0.03162943571805954, "learning_rate": 1.6279620360142594e-07, "loss": -0.0055, "num_tokens": 59087935.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3203250169754028, "sampling/importance_sampling_ratio/mean": 0.9996763467788696, "sampling/importance_sampling_ratio/min": 0.6391890048980713, "sampling/sampling_logp_difference/max": 0.4475550651550293, "sampling/sampling_logp_difference/mean": 0.016145117580890656, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 164.15625, "completions/mean_terminated_length": 164.15625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.34835562109947205, "epoch": 2.2916666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.018429910053984654, "kl": 0.025264177471399307, "learning_rate": 1.62270554186772e-07, "loss": 0.0003, "num_tokens": 59112665.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3424992561340332, "sampling/importance_sampling_ratio/mean": 1.0000231266021729, "sampling/importance_sampling_ratio/min": 0.6302787661552429, "sampling/sampling_logp_difference/max": 0.4615931510925293, "sampling/sampling_logp_difference/mean": 0.015209256671369076, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 189.34375, "completions/mean_terminated_length": 189.34375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4359246492385864, "epoch": 2.292892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.042675306786993814, "kl": 0.06339074671268463, "learning_rate": 1.6174559034149737e-07, "loss": 0.0007, "num_tokens": 59143151.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4962064027786255, "sampling/importance_sampling_ratio/mean": 1.0000028610229492, "sampling/importance_sampling_ratio/min": 0.37432149052619934, "sampling/sampling_logp_difference/max": 0.982640266418457, "sampling/sampling_logp_difference/mean": 0.01693284697830677, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 200.421875, "completions/mean_terminated_length": 200.421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.38589876890182495, "epoch": 2.2941176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.9712494212671277, "kl": 0.02685917168855667, "learning_rate": 1.6122131313124538e-07, "loss": 0.0914, "num_tokens": 59174746.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5962004661560059, "sampling/importance_sampling_ratio/mean": 0.99989253282547, "sampling/importance_sampling_ratio/min": 0.6143203377723694, "sampling/sampling_logp_difference/max": 0.4872387647628784, "sampling/sampling_logp_difference/mean": 0.015417475253343582, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 209.390625, "completions/mean_terminated_length": 209.390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.5156528949737549, "epoch": 2.295343137254902, "frac_reward_zero_std": 0.25, "grad_norm": 1.4502040434930394, "kl": 0.14193961024284363, "learning_rate": 1.606977236202654e-07, "loss": -0.0113, "num_tokens": 59204131.0, "reward": -0.15625, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3224138021469116, "sampling/importance_sampling_ratio/mean": 1.0005409717559814, "sampling/importance_sampling_ratio/min": 0.6412133574485779, "sampling/sampling_logp_difference/max": 0.4443930387496948, "sampling/sampling_logp_difference/mean": 0.016879171133041382, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 167.953125, "completions/mean_terminated_length": 167.953125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.36485856771469116, "epoch": 2.2965686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.03095722801658572, "kl": 0.07888477295637131, "learning_rate": 1.6017482287141088e-07, "loss": 0.0007, "num_tokens": 59230272.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.311119556427002, "sampling/importance_sampling_ratio/mean": 1.0003852844238281, "sampling/importance_sampling_ratio/min": 0.47601020336151123, "sampling/sampling_logp_difference/max": 0.7423160076141357, "sampling/sampling_logp_difference/mean": 0.014451163820922375, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 187.546875, "completions/mean_terminated_length": 187.546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.43709540367126465, "epoch": 2.297794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.019095308614842953, "kl": 0.03428737074136734, "learning_rate": 1.5965261194613755e-07, "loss": 0.0003, "num_tokens": 59257747.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.432391881942749, "sampling/importance_sampling_ratio/mean": 1.000012993812561, "sampling/importance_sampling_ratio/min": 0.613330066204071, "sampling/sampling_logp_difference/max": 0.48885202407836914, "sampling/sampling_logp_difference/mean": 0.017625847831368446, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5272804498672485, "epoch": 2.299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9897238317528053, "kl": 0.0817376896739006, "learning_rate": 1.591310919045003e-07, "loss": -0.0088, "num_tokens": 59286787.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5805723667144775, "sampling/importance_sampling_ratio/mean": 0.9994344115257263, "sampling/importance_sampling_ratio/min": 0.6074384450912476, "sampling/sampling_logp_difference/max": 0.4985044002532959, "sampling/sampling_logp_difference/mean": 0.01894564740359783, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 216.296875, "completions/mean_terminated_length": 216.296875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.39964842796325684, "epoch": 2.3002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7348597119880874, "kl": 0.03691429644823074, "learning_rate": 1.5861026380515163e-07, "loss": 0.0311, "num_tokens": 59316134.0, "reward": -0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": -0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5519212484359741, "sampling/importance_sampling_ratio/mean": 1.000169038772583, "sampling/importance_sampling_ratio/min": 0.6844035387039185, "sampling/sampling_logp_difference/max": 0.43949365615844727, "sampling/sampling_logp_difference/mean": 0.015072993002831936, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 257.84375, "completions/mean_terminated_length": 257.84375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.40951257944107056, "epoch": 2.301470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.5938953544983316, "kl": 0.03216740861535072, "learning_rate": 1.5809012870533995e-07, "loss": -0.0389, "num_tokens": 59350492.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0002248287200928, "sampling/importance_sampling_ratio/min": 0.6401075720787048, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.014520933851599693, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 226.734375, "completions/mean_terminated_length": 226.734375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.36163556575775146, "epoch": 2.3026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.016005730270182657, "kl": 0.029519587755203247, "learning_rate": 1.575706876609063e-07, "loss": 0.0003, "num_tokens": 59387131.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6274335384368896, "sampling/importance_sampling_ratio/mean": 1.0002219676971436, "sampling/importance_sampling_ratio/min": 0.7300496697425842, "sampling/sampling_logp_difference/max": 0.48700428009033203, "sampling/sampling_logp_difference/mean": 0.01361087616533041, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 192.703125, "completions/mean_terminated_length": 192.703125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4290919005870819, "epoch": 2.303921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9125959034845414, "kl": 0.028339726850390434, "learning_rate": 1.5705194172628323e-07, "loss": -0.0228, "num_tokens": 59422232.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5048365592956543, "sampling/importance_sampling_ratio/mean": 0.999923586845398, "sampling/importance_sampling_ratio/min": 0.6614570617675781, "sampling/sampling_logp_difference/max": 0.41331028938293457, "sampling/sampling_logp_difference/mean": 0.014484135434031487, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 213.71875, "completions/mean_terminated_length": 213.71875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4463856816291809, "epoch": 2.3051470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.8794570250768244, "kl": 0.027920614928007126, "learning_rate": 1.565338919544918e-07, "loss": 0.0258, "num_tokens": 59456966.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5813827514648438, "sampling/importance_sampling_ratio/mean": 0.9998728036880493, "sampling/importance_sampling_ratio/min": 0.6990068554878235, "sampling/sampling_logp_difference/max": 0.4582996368408203, "sampling/sampling_logp_difference/mean": 0.01653440296649933, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 226.8125, "completions/mean_terminated_length": 226.8125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4854315221309662, "epoch": 2.306372549019608, "frac_reward_zero_std": 0.5, "grad_norm": 1.1680551018341985, "kl": 0.05970364809036255, "learning_rate": 1.5601653939714072e-07, "loss": 0.029, "num_tokens": 59496746.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.620776653289795, "sampling/importance_sampling_ratio/mean": 1.0002939701080322, "sampling/importance_sampling_ratio/min": 0.4305850863456726, "sampling/sampling_logp_difference/max": 0.8426103591918945, "sampling/sampling_logp_difference/mean": 0.015956567600369453, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 205.40625, "completions/mean_terminated_length": 205.40625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3487470746040344, "epoch": 2.3075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.015999218989689683, "kl": 0.022527508437633514, "learning_rate": 1.5549988510442258e-07, "loss": 0.0002, "num_tokens": 59529108.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999996542930603, "sampling/importance_sampling_ratio/min": 0.5221103429794312, "sampling/sampling_logp_difference/max": 0.7264072895050049, "sampling/sampling_logp_difference/mean": 0.014306320808827877, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3543795645236969, "epoch": 2.3088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.017097044265547215, "kl": 0.02797403559088707, "learning_rate": 1.5498393012511285e-07, "loss": 0.0003, "num_tokens": 59557260.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.564490556716919, "sampling/importance_sampling_ratio/mean": 1.0002918243408203, "sampling/importance_sampling_ratio/min": 0.6820248961448669, "sampling/sampling_logp_difference/max": 0.44756031036376953, "sampling/sampling_logp_difference/mean": 0.014995114877820015, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 160.234375, "completions/mean_terminated_length": 160.234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.39390504360198975, "epoch": 2.310049019607843, "frac_reward_zero_std": 0.75, "grad_norm": 1.0198756834092795, "kl": 0.06174597516655922, "learning_rate": 1.5446867550656767e-07, "loss": 0.0043, "num_tokens": 59581579.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4194951057434082, "sampling/importance_sampling_ratio/mean": 0.9993045330047607, "sampling/importance_sampling_ratio/min": 0.692795991897583, "sampling/sampling_logp_difference/max": 0.3670196533203125, "sampling/sampling_logp_difference/mean": 0.015669317916035652, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 171.84375, "completions/mean_terminated_length": 171.84375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3988805413246155, "epoch": 2.311274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 1.1363833495861986, "kl": 0.03849609196186066, "learning_rate": 1.5395412229472103e-07, "loss": -0.0694, "num_tokens": 59616561.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.382973551750183, "sampling/importance_sampling_ratio/mean": 0.9999716281890869, "sampling/importance_sampling_ratio/min": 0.64705890417099, "sampling/sampling_logp_difference/max": 0.4353179931640625, "sampling/sampling_logp_difference/mean": 0.01593279466032982, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 172.015625, "completions/mean_terminated_length": 172.015625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3996015191078186, "epoch": 2.3125, "frac_reward_zero_std": 0.75, "grad_norm": 1.1523603472810369, "kl": 0.039179421961307526, "learning_rate": 1.5344027153408374e-07, "loss": 0.0099, "num_tokens": 59655074.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3922832012176514, "sampling/importance_sampling_ratio/mean": 0.9999604225158691, "sampling/importance_sampling_ratio/min": 0.6298457980155945, "sampling/sampling_logp_difference/max": 0.4622802734375, "sampling/sampling_logp_difference/mean": 0.015173811465501785, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 196.21875, "completions/mean_terminated_length": 196.21875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.32767951488494873, "epoch": 2.313725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.9607446647164233, "kl": 0.024808941408991814, "learning_rate": 1.5292712426773973e-07, "loss": 0.0369, "num_tokens": 59683280.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5017434358596802, "sampling/importance_sampling_ratio/mean": 0.9995311498641968, "sampling/importance_sampling_ratio/min": 0.7266528606414795, "sampling/sampling_logp_difference/max": 0.40662670135498047, "sampling/sampling_logp_difference/mean": 0.01315168384462595, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 193.484375, "completions/mean_terminated_length": 193.484375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.30524688959121704, "epoch": 2.314950980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.015802220938073968, "kl": 0.022244073450565338, "learning_rate": 1.5241468153734594e-07, "loss": 0.0002, "num_tokens": 59721327.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071572065353394, "sampling/importance_sampling_ratio/mean": 0.9994029402732849, "sampling/importance_sampling_ratio/min": 0.6557306051254272, "sampling/sampling_logp_difference/max": 0.42200517654418945, "sampling/sampling_logp_difference/mean": 0.012906955555081367, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3782504200935364, "epoch": 2.3161764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.847135206429144, "kl": 0.06033479794859886, "learning_rate": 1.5190294438312834e-07, "loss": -0.0002, "num_tokens": 59746959.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.300763487815857, "sampling/importance_sampling_ratio/mean": 0.9997039437294006, "sampling/importance_sampling_ratio/min": 0.62606281042099, "sampling/sampling_logp_difference/max": 0.4683046340942383, "sampling/sampling_logp_difference/mean": 0.015130390413105488, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 154.640625, "completions/mean_terminated_length": 154.640625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.37160420417785645, "epoch": 2.3174019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.0895433055967554, "kl": 0.0382518470287323, "learning_rate": 1.5139191384388094e-07, "loss": 0.003, "num_tokens": 59771416.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6412906646728516, "sampling/importance_sampling_ratio/mean": 0.999275267124176, "sampling/importance_sampling_ratio/min": 0.662240207195282, "sampling/sampling_logp_difference/max": 0.4954829216003418, "sampling/sampling_logp_difference/mean": 0.01576092839241028, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 257.046875, "completions/mean_terminated_length": 257.046875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.253875195980072, "epoch": 2.318627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.011137341327517333, "kl": 0.015241425484418869, "learning_rate": 1.5088159095696362e-07, "loss": 0.0001, "num_tokens": 59802907.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6928167343139648, "sampling/importance_sampling_ratio/mean": 1.0005075931549072, "sampling/importance_sampling_ratio/min": 0.6532841324806213, "sampling/sampling_logp_difference/max": 0.5263938903808594, "sampling/sampling_logp_difference/mean": 0.010319976136088371, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 216.09375, "completions/mean_terminated_length": 216.09375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2632845640182495, "epoch": 2.3198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01943818221351062, "kl": 0.024373536929488182, "learning_rate": 1.5037197675829916e-07, "loss": 0.0002, "num_tokens": 59837681.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0005693435668945, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.4361441135406494, "sampling/sampling_logp_difference/mean": 0.011405350640416145, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 221.046875, "completions/mean_terminated_length": 221.046875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3872997760772705, "epoch": 2.321078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.014508614460023527, "kl": 0.023506823927164078, "learning_rate": 1.4986307228237267e-07, "loss": 0.0002, "num_tokens": 59875668.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4615130424499512, "sampling/importance_sampling_ratio/mean": 1.0005359649658203, "sampling/importance_sampling_ratio/min": 0.6202294230461121, "sampling/sampling_logp_difference/max": 0.47766590118408203, "sampling/sampling_logp_difference/mean": 0.014735687524080276, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 195.734375, "completions/mean_terminated_length": 195.734375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.48053818941116333, "epoch": 2.3223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.8543515042561025, "kl": 0.047940317541360855, "learning_rate": 1.4935487856222723e-07, "loss": -0.0152, "num_tokens": 59908611.0, "reward": -0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": -0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.3627339601516724, "sampling/importance_sampling_ratio/mean": 1.0002124309539795, "sampling/importance_sampling_ratio/min": 0.7186917662620544, "sampling/sampling_logp_difference/max": 0.3303227424621582, "sampling/sampling_logp_difference/mean": 0.01626776158809662, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 218.859375, "completions/mean_terminated_length": 218.859375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3811754584312439, "epoch": 2.323529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.016649727675794618, "kl": 0.028688944876194, "learning_rate": 1.4884739662946445e-07, "loss": 0.0003, "num_tokens": 59943594.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 0.999620795249939, "sampling/importance_sampling_ratio/min": 0.5384246110916138, "sampling/sampling_logp_difference/max": 0.6191077828407288, "sampling/sampling_logp_difference/mean": 0.0145262461155653, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 196.234375, "completions/mean_terminated_length": 196.234375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3703479766845703, "epoch": 2.3247549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.015270272827534641, "kl": 0.02394244819879532, "learning_rate": 1.4834062751424015e-07, "loss": 0.0002, "num_tokens": 59976473.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6088929176330566, "sampling/importance_sampling_ratio/mean": 0.9999560713768005, "sampling/importance_sampling_ratio/min": 0.5807967185974121, "sampling/sampling_logp_difference/max": 0.5433545112609863, "sampling/sampling_logp_difference/mean": 0.01434406265616417, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 204.1875, "completions/mean_terminated_length": 204.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3638109564781189, "epoch": 2.325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.013468778188168539, "kl": 0.02090836875140667, "learning_rate": 1.478345722452639e-07, "loss": 0.0002, "num_tokens": 60005701.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5052893161773682, "sampling/importance_sampling_ratio/mean": 0.9993464946746826, "sampling/importance_sampling_ratio/min": 0.5578929781913757, "sampling/sampling_logp_difference/max": 0.5835881233215332, "sampling/sampling_logp_difference/mean": 0.014573956839740276, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 162.984375, "completions/mean_terminated_length": 162.984375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3963279724121094, "epoch": 2.327205882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01904312950438511, "kl": 0.03473995253443718, "learning_rate": 1.4732923184979562e-07, "loss": 0.0003, "num_tokens": 60033428.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.410925269126892, "sampling/importance_sampling_ratio/mean": 0.9995149970054626, "sampling/importance_sampling_ratio/min": 0.6589466333389282, "sampling/sampling_logp_difference/max": 0.4171128273010254, "sampling/sampling_logp_difference/mean": 0.015039639547467232, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 197.515625, "completions/mean_terminated_length": 197.515625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.39951860904693604, "epoch": 2.3284313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.8576159884898699, "kl": 0.0305451862514019, "learning_rate": 1.4682460735364422e-07, "loss": 0.0163, "num_tokens": 60062565.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3865586519241333, "sampling/importance_sampling_ratio/mean": 1.0001012086868286, "sampling/importance_sampling_ratio/min": 0.6482308506965637, "sampling/sampling_logp_difference/max": 0.43350839614868164, "sampling/sampling_logp_difference/mean": 0.014886585995554924, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.2817862927913666, "epoch": 2.329656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.015977006957823323, "kl": 0.01980246603488922, "learning_rate": 1.4632069978116584e-07, "loss": 0.0002, "num_tokens": 60092269.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5823426246643066, "sampling/importance_sampling_ratio/mean": 1.0000944137573242, "sampling/importance_sampling_ratio/min": 0.606020987033844, "sampling/sampling_logp_difference/max": 0.5008406639099121, "sampling/sampling_logp_difference/mean": 0.012155575677752495, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 180.203125, "completions/mean_terminated_length": 180.203125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3786773979663849, "epoch": 2.3308823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.017527550338411666, "kl": 0.02328067645430565, "learning_rate": 1.4581751015526033e-07, "loss": 0.0002, "num_tokens": 60120810.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6157784461975098, "sampling/importance_sampling_ratio/mean": 1.0003771781921387, "sampling/importance_sampling_ratio/min": 0.6090195775032043, "sampling/sampling_logp_difference/max": 0.49590492248535156, "sampling/sampling_logp_difference/mean": 0.015042027458548546, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 179.1875, "completions/mean_terminated_length": 179.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4272390604019165, "epoch": 2.332107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.0790700487473894, "kl": 0.03297770768404007, "learning_rate": 1.4531503949737106e-07, "loss": 0.0314, "num_tokens": 60152566.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5092428922653198, "sampling/importance_sampling_ratio/mean": 1.0000388622283936, "sampling/importance_sampling_ratio/min": 0.6622360348701477, "sampling/sampling_logp_difference/max": 0.41213321685791016, "sampling/sampling_logp_difference/mean": 0.016097504645586014, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.28215962648391724, "epoch": 2.3333333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.019570314729581942, "kl": 0.022349946200847626, "learning_rate": 1.4481328882748184e-07, "loss": 0.0002, "num_tokens": 60180814.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3916409015655518, "sampling/importance_sampling_ratio/mean": 0.9997809529304504, "sampling/importance_sampling_ratio/min": 0.4278160035610199, "sampling/sampling_logp_difference/max": 0.8490620851516724, "sampling/sampling_logp_difference/mean": 0.011633295565843582, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 186.203125, "completions/mean_terminated_length": 186.203125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.37325119972229004, "epoch": 2.3345588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.9482082358343803, "kl": 0.03111817128956318, "learning_rate": 1.4431225916411455e-07, "loss": -0.0226, "num_tokens": 60207579.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.345187783241272, "sampling/importance_sampling_ratio/mean": 1.000281572341919, "sampling/importance_sampling_ratio/min": 0.6062778234481812, "sampling/sampling_logp_difference/max": 0.5004169940948486, "sampling/sampling_logp_difference/mean": 0.014844301156699657, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 198.203125, "completions/mean_terminated_length": 198.203125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.45078957080841064, "epoch": 2.3357843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8205068788653546, "kl": 0.057336561381816864, "learning_rate": 1.4381195152432769e-07, "loss": -0.0238, "num_tokens": 60242392.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5392322540283203, "sampling/importance_sampling_ratio/mean": 0.9996870756149292, "sampling/importance_sampling_ratio/min": 0.5160008668899536, "sampling/sampling_logp_difference/max": 0.661646842956543, "sampling/sampling_logp_difference/mean": 0.017214279621839523, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 148.90625, "completions/mean_terminated_length": 148.90625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.34201350808143616, "epoch": 2.3370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.05245526815709473, "kl": 0.0537988543510437, "learning_rate": 1.4331236692371384e-07, "loss": 0.0005, "num_tokens": 60266034.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.602060079574585, "sampling/importance_sampling_ratio/mean": 0.9996627569198608, "sampling/importance_sampling_ratio/min": 0.699406087398529, "sampling/sampling_logp_difference/max": 0.47129034996032715, "sampling/sampling_logp_difference/mean": 0.01396908238530159, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 176.359375, "completions/mean_terminated_length": 176.359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.41292208433151245, "epoch": 2.338235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.1055960854986142, "kl": 0.033058423548936844, "learning_rate": 1.428135063763985e-07, "loss": -0.042, "num_tokens": 60298713.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.6507264375686646, "sampling/importance_sampling_ratio/mean": 1.0003893375396729, "sampling/importance_sampling_ratio/min": 0.6855221390724182, "sampling/sampling_logp_difference/max": 0.5012154579162598, "sampling/sampling_logp_difference/mean": 0.01686900667846203, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 193.0625, "completions/mean_terminated_length": 193.0625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.374266654253006, "epoch": 2.3394607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.02167045988977778, "kl": 0.04262005537748337, "learning_rate": 1.4231537089503675e-07, "loss": 0.0004, "num_tokens": 60329613.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.539963722229004, "sampling/importance_sampling_ratio/mean": 1.000189185142517, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.015530496835708618, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 260.796875, "completions/mean_terminated_length": 260.796875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4065209925174713, "epoch": 2.340686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.7441707090446501, "kl": 0.024460311979055405, "learning_rate": 1.4181796149081194e-07, "loss": -0.0102, "num_tokens": 60367152.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4363536834716797, "sampling/importance_sampling_ratio/mean": 0.999853789806366, "sampling/importance_sampling_ratio/min": 0.4462917745113373, "sampling/sampling_logp_difference/max": 0.8067823648452759, "sampling/sampling_logp_difference/mean": 0.013409093022346497, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 178.828125, "completions/mean_terminated_length": 178.828125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.37713760137557983, "epoch": 2.3419117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 0.9662019583827675, "kl": 0.05671603977680206, "learning_rate": 1.4132127917343394e-07, "loss": -0.0134, "num_tokens": 60394981.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.566908359527588, "sampling/importance_sampling_ratio/mean": 0.9996801614761353, "sampling/importance_sampling_ratio/min": 0.702597439289093, "sampling/sampling_logp_difference/max": 0.44910454750061035, "sampling/sampling_logp_difference/mean": 0.014646901749074459, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 181.28125, "completions/mean_terminated_length": 181.28125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.44547849893569946, "epoch": 2.343137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.0625432473513456, "kl": 0.032026518136262894, "learning_rate": 1.4082532495113624e-07, "loss": -0.005, "num_tokens": 60423527.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.487159252166748, "sampling/importance_sampling_ratio/mean": 1.0001486539840698, "sampling/importance_sampling_ratio/min": 0.6141946315765381, "sampling/sampling_logp_difference/max": 0.4874434471130371, "sampling/sampling_logp_difference/mean": 0.015257124789059162, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 218.34375, "completions/mean_terminated_length": 218.34375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.26839399337768555, "epoch": 2.344362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.9766257339179132, "kl": 0.028687234967947006, "learning_rate": 1.4033009983067452e-07, "loss": 0.0311, "num_tokens": 60457021.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.407607913017273, "sampling/importance_sampling_ratio/mean": 0.9995027780532837, "sampling/importance_sampling_ratio/min": 0.5688512921333313, "sampling/sampling_logp_difference/max": 0.564136266708374, "sampling/sampling_logp_difference/mean": 0.010518480092287064, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 191.34375, "completions/mean_terminated_length": 191.34375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.42183470726013184, "epoch": 2.3455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.0375708725802242, "kl": 0.033426955342292786, "learning_rate": 1.398356048173242e-07, "loss": -0.0042, "num_tokens": 60489475.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.627564787864685, "sampling/importance_sampling_ratio/mean": 1.0002881288528442, "sampling/importance_sampling_ratio/min": 0.6455309391021729, "sampling/sampling_logp_difference/max": 0.48708486557006836, "sampling/sampling_logp_difference/mean": 0.01605754718184471, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 221.796875, "completions/mean_terminated_length": 221.796875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35900455713272095, "epoch": 2.346813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7729176553637848, "kl": 0.026185456663370132, "learning_rate": 1.3934184091487915e-07, "loss": 0.0099, "num_tokens": 60518086.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.8179771900177002, "sampling/importance_sampling_ratio/mean": 1.0001744031906128, "sampling/importance_sampling_ratio/min": 0.706462562084198, "sampling/sampling_logp_difference/max": 0.597724437713623, "sampling/sampling_logp_difference/mean": 0.013698907569050789, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 160.96875, "completions/mean_terminated_length": 160.96875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.29528480768203735, "epoch": 2.3480392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.01652170370905375, "kl": 0.022979356348514557, "learning_rate": 1.3884880912564873e-07, "loss": 0.0002, "num_tokens": 60544388.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6976975202560425, "sampling/importance_sampling_ratio/mean": 1.0002460479736328, "sampling/importance_sampling_ratio/min": 0.5719316601753235, "sampling/sampling_logp_difference/max": 0.558735728263855, "sampling/sampling_logp_difference/mean": 0.012866152450442314, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 186.9375, "completions/mean_terminated_length": 186.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.31954970955848694, "epoch": 2.349264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.018463198147938467, "kl": 0.025077354162931442, "learning_rate": 1.3835651045045598e-07, "loss": 0.0002, "num_tokens": 60570464.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6093238592147827, "sampling/importance_sampling_ratio/mean": 1.0000442266464233, "sampling/importance_sampling_ratio/min": 0.6098756790161133, "sampling/sampling_logp_difference/max": 0.49450016021728516, "sampling/sampling_logp_difference/mean": 0.014753278344869614, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 204.015625, "completions/mean_terminated_length": 204.015625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3832298517227173, "epoch": 2.3504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.0315911254897412, "kl": 0.02888948656618595, "learning_rate": 1.3786494588863633e-07, "loss": 0.026, "num_tokens": 60610497.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.417617678642273, "sampling/importance_sampling_ratio/mean": 1.0000133514404297, "sampling/importance_sampling_ratio/min": 0.7053484320640564, "sampling/sampling_logp_difference/max": 0.3490633964538574, "sampling/sampling_logp_difference/mean": 0.013524937443435192, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 209.84375, "completions/mean_terminated_length": 209.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3601645231246948, "epoch": 2.3517156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.013180003386095242, "kl": 0.021510332822799683, "learning_rate": 1.3737411643803448e-07, "loss": 0.0002, "num_tokens": 60641943.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4875458478927612, "sampling/importance_sampling_ratio/mean": 0.999901294708252, "sampling/importance_sampling_ratio/min": 0.6780306696891785, "sampling/sampling_logp_difference/max": 0.397127628326416, "sampling/sampling_logp_difference/mean": 0.013994856737554073, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4034101068973541, "epoch": 2.3529411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.3034565320338045, "kl": 0.022126968950033188, "learning_rate": 1.368840230950035e-07, "loss": 0.0277, "num_tokens": 60679679.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.6274250745773315, "sampling/importance_sampling_ratio/mean": 0.9997659921646118, "sampling/importance_sampling_ratio/min": 0.620280385017395, "sampling/sampling_logp_difference/max": 0.4869990348815918, "sampling/sampling_logp_difference/mean": 0.014682994224131107, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 202.46875, "completions/mean_terminated_length": 202.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3586995303630829, "epoch": 2.3541666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.013093767728225337, "kl": 0.02074764110147953, "learning_rate": 1.3639466685440132e-07, "loss": 0.0002, "num_tokens": 60710285.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9231685400009155, "sampling/importance_sampling_ratio/mean": 1.0003397464752197, "sampling/importance_sampling_ratio/min": 0.6959205269813538, "sampling/sampling_logp_difference/max": 0.6539740562438965, "sampling/sampling_logp_difference/mean": 0.014015990309417248, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35728025436401367, "epoch": 2.355392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.02601540431378326, "kl": 0.029771855100989342, "learning_rate": 1.3590604870959043e-07, "loss": 0.0003, "num_tokens": 60735917.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5385515689849854, "sampling/importance_sampling_ratio/mean": 0.9996775388717651, "sampling/importance_sampling_ratio/min": 0.6397542357444763, "sampling/sampling_logp_difference/max": 0.4466712474822998, "sampling/sampling_logp_difference/mean": 0.014768035151064396, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 189.765625, "completions/mean_terminated_length": 189.765625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3744673728942871, "epoch": 2.3566176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 1.0277436056638698, "kl": 0.02428172156214714, "learning_rate": 1.3541816965243462e-07, "loss": 0.0418, "num_tokens": 60769806.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6652601957321167, "sampling/importance_sampling_ratio/mean": 1.000030517578125, "sampling/importance_sampling_ratio/min": 0.618177056312561, "sampling/sampling_logp_difference/max": 0.5099813938140869, "sampling/sampling_logp_difference/mean": 0.013300665654242039, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 175.6875, "completions/mean_terminated_length": 175.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.348365843296051, "epoch": 2.357843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.024166011267922807, "kl": 0.026825370267033577, "learning_rate": 1.3493103067329737e-07, "loss": 0.0003, "num_tokens": 60799354.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5479344129562378, "sampling/importance_sampling_ratio/mean": 0.9999157190322876, "sampling/importance_sampling_ratio/min": 0.6347072124481201, "sampling/sampling_logp_difference/max": 0.4545915126800537, "sampling/sampling_logp_difference/mean": 0.01401049830019474, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 218.171875, "completions/mean_terminated_length": 218.171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3539429306983948, "epoch": 2.3590686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.747960810606086, "kl": 0.03561093658208847, "learning_rate": 1.3444463276104012e-07, "loss": -0.0106, "num_tokens": 60832293.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4050774574279785, "sampling/importance_sampling_ratio/mean": 0.9996697902679443, "sampling/importance_sampling_ratio/min": 0.6177712678909302, "sampling/sampling_logp_difference/max": 0.48163700103759766, "sampling/sampling_logp_difference/mean": 0.014360794797539711, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 216.046875, "completions/mean_terminated_length": 216.046875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3854154944419861, "epoch": 2.360294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.9748618910236813, "kl": 0.024287715554237366, "learning_rate": 1.3395897690301966e-07, "loss": 0.0918, "num_tokens": 60866616.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.9118987321853638, "sampling/importance_sampling_ratio/mean": 1.0000795125961304, "sampling/importance_sampling_ratio/min": 0.6256084442138672, "sampling/sampling_logp_difference/max": 0.6480967998504639, "sampling/sampling_logp_difference/mean": 0.013329725712537766, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 231.546875, "completions/mean_terminated_length": 231.546875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3768054246902466, "epoch": 2.361519607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.013843168965082383, "kl": 0.019867606461048126, "learning_rate": 1.3347406408508694e-07, "loss": 0.0002, "num_tokens": 60897451.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.424190878868103, "sampling/importance_sampling_ratio/mean": 1.0000393390655518, "sampling/importance_sampling_ratio/min": 0.684594452381134, "sampling/sampling_logp_difference/max": 0.37892866134643555, "sampling/sampling_logp_difference/mean": 0.014006221666932106, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 226.34375, "completions/mean_terminated_length": 226.34375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.40192288160324097, "epoch": 2.3627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7252619919133455, "kl": 0.0270424522459507, "learning_rate": 1.3298989529158378e-07, "loss": 0.0071, "num_tokens": 60932769.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6022827625274658, "sampling/importance_sampling_ratio/mean": 1.000500202178955, "sampling/importance_sampling_ratio/min": 0.6042662858963013, "sampling/sampling_logp_difference/max": 0.5037403106689453, "sampling/sampling_logp_difference/mean": 0.014340700581669807, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 160.171875, "completions/mean_terminated_length": 160.171875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.29812660813331604, "epoch": 2.363970588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.9414259803642635, "kl": 0.03467084467411041, "learning_rate": 1.325064715053425e-07, "loss": -0.0398, "num_tokens": 60957116.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6463149785995483, "sampling/importance_sampling_ratio/mean": 1.0005197525024414, "sampling/importance_sampling_ratio/min": 0.6299277544021606, "sampling/sampling_logp_difference/max": 0.49853944778442383, "sampling/sampling_logp_difference/mean": 0.013119611889123917, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 223.46875, "completions/mean_terminated_length": 223.46875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.47285810112953186, "epoch": 2.3651960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.8889440291759665, "kl": 0.036731865257024765, "learning_rate": 1.320237937076825e-07, "loss": -0.0301, "num_tokens": 60989866.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3725612163543701, "sampling/importance_sampling_ratio/mean": 1.0006674528121948, "sampling/importance_sampling_ratio/min": 0.5886189937591553, "sampling/sampling_logp_difference/max": 0.5299761891365051, "sampling/sampling_logp_difference/mean": 0.01706557720899582, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.38296884298324585, "epoch": 2.366421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.020030366923427478, "kl": 0.03225460276007652, "learning_rate": 1.3154186287840946e-07, "loss": 0.0003, "num_tokens": 61021174.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6649678945541382, "sampling/importance_sampling_ratio/mean": 0.9999372959136963, "sampling/importance_sampling_ratio/min": 0.5696814060211182, "sampling/sampling_logp_difference/max": 0.5626779794692993, "sampling/sampling_logp_difference/mean": 0.016994740813970566, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 185.484375, "completions/mean_terminated_length": 185.484375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.44133448600769043, "epoch": 2.3676470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.6970894685886355, "kl": 0.07990291714668274, "learning_rate": 1.310606799958122e-07, "loss": -0.0044, "num_tokens": 61054853.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3333005905151367, "sampling/importance_sampling_ratio/mean": 1.0000016689300537, "sampling/importance_sampling_ratio/min": 0.6799634695053101, "sampling/sampling_logp_difference/max": 0.38571619987487793, "sampling/sampling_logp_difference/mean": 0.01756683550775051, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 230.75, "completions/mean_terminated_length": 230.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.40245676040649414, "epoch": 2.368872549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0135021678291646, "kl": 0.024305783212184906, "learning_rate": 1.305802460366615e-07, "loss": 0.0002, "num_tokens": 61094357.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5206624269485474, "sampling/importance_sampling_ratio/mean": 0.9997435808181763, "sampling/importance_sampling_ratio/min": 0.6622399091720581, "sampling/sampling_logp_difference/max": 0.4191460609436035, "sampling/sampling_logp_difference/mean": 0.015357659198343754, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 175.859375, "completions/mean_terminated_length": 175.859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3690338730812073, "epoch": 2.3700980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.204549270967772, "kl": 0.026068702340126038, "learning_rate": 1.3010056197620812e-07, "loss": 0.0464, "num_tokens": 61125388.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5744441747665405, "sampling/importance_sampling_ratio/mean": 1.0006462335586548, "sampling/importance_sampling_ratio/min": 0.5334433913230896, "sampling/sampling_logp_difference/max": 0.6284023523330688, "sampling/sampling_logp_difference/mean": 0.015417314134538174, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4958120584487915, "epoch": 2.3713235294117645, "frac_reward_zero_std": 0.5, "grad_norm": 1.2295254402215419, "kl": 0.05303645133972168, "learning_rate": 1.2962162878817985e-07, "loss": -0.014, "num_tokens": 61161524.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.396849274635315, "sampling/importance_sampling_ratio/mean": 1.000116229057312, "sampling/importance_sampling_ratio/min": 0.7317201495170593, "sampling/sampling_logp_difference/max": 0.33421921730041504, "sampling/sampling_logp_difference/mean": 0.016872648149728775, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.37191271781921387, "epoch": 2.372549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.015527941135002886, "kl": 0.02844407968223095, "learning_rate": 1.2914344744478112e-07, "loss": 0.0003, "num_tokens": 61194756.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6271814107894897, "sampling/importance_sampling_ratio/mean": 0.9991934895515442, "sampling/importance_sampling_ratio/min": 0.6379886865615845, "sampling/sampling_logp_difference/max": 0.486849308013916, "sampling/sampling_logp_difference/mean": 0.01516721025109291, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 188.734375, "completions/mean_terminated_length": 188.734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.346946656703949, "epoch": 2.373774509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.9941853943369408, "kl": 0.03440224006772041, "learning_rate": 1.2866601891668942e-07, "loss": 0.0366, "num_tokens": 61223107.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.666730284690857, "sampling/importance_sampling_ratio/mean": 1.0004414319992065, "sampling/importance_sampling_ratio/min": 0.5941352844238281, "sampling/sampling_logp_difference/max": 0.5206482410430908, "sampling/sampling_logp_difference/mean": 0.01369963027536869, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3266833424568176, "epoch": 2.375, "frac_reward_zero_std": 1.0, "grad_norm": 0.02055707743919508, "kl": 0.023911627009510994, "learning_rate": 1.2818934417305477e-07, "loss": 0.0002, "num_tokens": 61250619.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3993620872497559, "sampling/importance_sampling_ratio/mean": 0.9997545480728149, "sampling/importance_sampling_ratio/min": 0.17980583012104034, "sampling/sampling_logp_difference/max": 1.7158777713775635, "sampling/sampling_logp_difference/mean": 0.014155317097902298, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 235.515625, "completions/mean_terminated_length": 235.515625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.36854732036590576, "epoch": 2.376225490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.020321860124814524, "kl": 0.030030731111764908, "learning_rate": 1.2771342418149656e-07, "loss": 0.0003, "num_tokens": 61286748.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.633960485458374, "sampling/importance_sampling_ratio/mean": 1.0005673170089722, "sampling/importance_sampling_ratio/min": 0.6923255324363708, "sampling/sampling_logp_difference/max": 0.49100685119628906, "sampling/sampling_logp_difference/mean": 0.015137957409024239, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 227.84375, "completions/mean_terminated_length": 227.84375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.5019451379776001, "epoch": 2.377450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.7251780945530559, "kl": 0.06397197395563126, "learning_rate": 1.2723825990810204e-07, "loss": 0.0009, "num_tokens": 61320034.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.642146348953247, "sampling/importance_sampling_ratio/mean": 1.0002970695495605, "sampling/importance_sampling_ratio/min": 0.6581567525863647, "sampling/sampling_logp_difference/max": 0.4960041046142578, "sampling/sampling_logp_difference/mean": 0.016976214945316315, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 176.171875, "completions/mean_terminated_length": 176.171875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.273092657327652, "epoch": 2.3786764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01880395412207116, "kl": 0.021814711391925812, "learning_rate": 1.2676385231742494e-07, "loss": 0.0002, "num_tokens": 61347853.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5767922401428223, "sampling/importance_sampling_ratio/mean": 1.0000545978546143, "sampling/importance_sampling_ratio/min": 0.5499488711357117, "sampling/sampling_logp_difference/max": 0.5979299545288086, "sampling/sampling_logp_difference/mean": 0.012390416115522385, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 213.078125, "completions/mean_terminated_length": 213.078125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.31701821088790894, "epoch": 2.3799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8289956829635279, "kl": 0.019230041652917862, "learning_rate": 1.262902023724824e-07, "loss": 0.0692, "num_tokens": 61378162.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5289256572723389, "sampling/importance_sampling_ratio/mean": 1.000627040863037, "sampling/importance_sampling_ratio/min": 0.6622400879859924, "sampling/sampling_logp_difference/max": 0.42456531524658203, "sampling/sampling_logp_difference/mean": 0.011816595681011677, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 217.6875, "completions/mean_terminated_length": 217.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4216040372848511, "epoch": 2.381127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.7773989763178522, "kl": 0.03368686884641647, "learning_rate": 1.258173110347538e-07, "loss": -0.0106, "num_tokens": 61421262.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5190861225128174, "sampling/importance_sampling_ratio/mean": 1.000107765197754, "sampling/importance_sampling_ratio/min": 0.5889346599578857, "sampling/sampling_logp_difference/max": 0.5294401049613953, "sampling/sampling_logp_difference/mean": 0.014785869047045708, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 242.015625, "completions/mean_terminated_length": 242.015625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.34563833475112915, "epoch": 2.3823529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 0.9038419975369213, "kl": 0.022247061133384705, "learning_rate": 1.253451792641785e-07, "loss": -0.0065, "num_tokens": 61456047.0, "reward": 0.09375, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000001192092896, "sampling/importance_sampling_ratio/min": 0.6332416534423828, "sampling/sampling_logp_difference/max": 0.8587944507598877, "sampling/sampling_logp_difference/mean": 0.012755580246448517, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 177.453125, "completions/mean_terminated_length": 177.453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.443953275680542, "epoch": 2.383578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.3382090616515316, "kl": 0.042421214282512665, "learning_rate": 1.248738080191543e-07, "loss": 0.0166, "num_tokens": 61482300.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.631811499595642, "sampling/importance_sampling_ratio/mean": 0.9994724988937378, "sampling/importance_sampling_ratio/min": 0.13601350784301758, "sampling/sampling_logp_difference/max": 1.9950010776519775, "sampling/sampling_logp_difference/mean": 0.01710696518421173, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 172.765625, "completions/mean_terminated_length": 172.765625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.35881149768829346, "epoch": 2.3848039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02606381030151454, "kl": 0.03052517957985401, "learning_rate": 1.244031982565349e-07, "loss": 0.0003, "num_tokens": 61507949.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4016011953353882, "sampling/importance_sampling_ratio/mean": 0.9998782873153687, "sampling/importance_sampling_ratio/min": 0.6277007460594177, "sampling/sampling_logp_difference/max": 0.46569180488586426, "sampling/sampling_logp_difference/mean": 0.01494034007191658, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 213.28125, "completions/mean_terminated_length": 213.28125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.529669463634491, "epoch": 2.386029411764706, "frac_reward_zero_std": 0.25, "grad_norm": 1.5369391712282672, "kl": 0.10564006865024567, "learning_rate": 1.239333509316281e-07, "loss": -0.0258, "num_tokens": 61542255.0, "reward": 0.40625, "reward_std": 0.6205305457115173, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6018822193145752, "sampling/importance_sampling_ratio/mean": 1.0002837181091309, "sampling/importance_sampling_ratio/min": 0.655114471912384, "sampling/sampling_logp_difference/max": 0.4711792469024658, "sampling/sampling_logp_difference/mean": 0.018138162791728973, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 183.921875, "completions/mean_terminated_length": 183.921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3101119101047516, "epoch": 2.3872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.014355034561366454, "kl": 0.023137640208005905, "learning_rate": 1.2346426699819456e-07, "loss": 0.0002, "num_tokens": 61572538.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5019768476486206, "sampling/importance_sampling_ratio/mean": 0.9999720454216003, "sampling/importance_sampling_ratio/min": 0.6942974328994751, "sampling/sampling_logp_difference/max": 0.4067821502685547, "sampling/sampling_logp_difference/mean": 0.01327432505786419, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 181.4375, "completions/mean_terminated_length": 181.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3148180842399597, "epoch": 2.388480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.01795075784860789, "kl": 0.024346178397536278, "learning_rate": 1.2299594740844476e-07, "loss": 0.0002, "num_tokens": 61601030.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5028719902038574, "sampling/importance_sampling_ratio/mean": 0.9996898174285889, "sampling/importance_sampling_ratio/min": 0.6907265186309814, "sampling/sampling_logp_difference/max": 0.4073779582977295, "sampling/sampling_logp_difference/mean": 0.013319061137735844, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 152.546875, "completions/mean_terminated_length": 152.546875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3306979238986969, "epoch": 2.389705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.016788307033416612, "kl": 0.02332313545048237, "learning_rate": 1.225283931130378e-07, "loss": 0.0002, "num_tokens": 61625673.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5292854309082031, "sampling/importance_sampling_ratio/mean": 1.0003693103790283, "sampling/importance_sampling_ratio/min": 0.6097698211669922, "sampling/sampling_logp_difference/max": 0.4946737289428711, "sampling/sampling_logp_difference/mean": 0.01440692599862814, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.37355828285217285, "epoch": 2.3909313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.6250700743102716, "kl": 0.04412819445133209, "learning_rate": 1.220616050610791e-07, "loss": 0.014, "num_tokens": 61652585.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3118984699249268, "sampling/importance_sampling_ratio/mean": 0.9995921850204468, "sampling/importance_sampling_ratio/min": 0.6029638648033142, "sampling/sampling_logp_difference/max": 0.5058979988098145, "sampling/sampling_logp_difference/mean": 0.015270160511136055, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 198.828125, "completions/mean_terminated_length": 198.828125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.35545986890792847, "epoch": 2.392156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.8277295072089057, "kl": 0.0282914862036705, "learning_rate": 1.2159558420011905e-07, "loss": 0.0111, "num_tokens": 61683470.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.622059941291809, "sampling/importance_sampling_ratio/mean": 1.000211477279663, "sampling/importance_sampling_ratio/min": 0.6415322422981262, "sampling/sampling_logp_difference/max": 0.48369693756103516, "sampling/sampling_logp_difference/mean": 0.014084463939070702, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 261.859375, "completions/mean_terminated_length": 261.859375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.32786688208580017, "epoch": 2.3933823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01641777539709697, "kl": 0.021822992712259293, "learning_rate": 1.2113033147615071e-07, "loss": 0.0002, "num_tokens": 61715029.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.475993275642395, "sampling/importance_sampling_ratio/mean": 1.0002014636993408, "sampling/importance_sampling_ratio/min": 0.6106759309768677, "sampling/sampling_logp_difference/max": 0.49318885803222656, "sampling/sampling_logp_difference/mean": 0.01305415015667677, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 184.328125, "completions/mean_terminated_length": 184.328125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.32844293117523193, "epoch": 2.394607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.014920561640594215, "kl": 0.020569054409861565, "learning_rate": 1.206658478336071e-07, "loss": 0.0002, "num_tokens": 61743674.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007356643676758, "sampling/importance_sampling_ratio/mean": 1.0000507831573486, "sampling/importance_sampling_ratio/min": 0.685139000415802, "sampling/sampling_logp_difference/max": 0.47046327590942383, "sampling/sampling_logp_difference/mean": 0.01349298283457756, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 164.203125, "completions/mean_terminated_length": 164.203125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.326759397983551, "epoch": 2.3958333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.02062552066755512, "kl": 0.02400572970509529, "learning_rate": 1.2020213421536103e-07, "loss": 0.0002, "num_tokens": 61771143.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.405951738357544, "sampling/importance_sampling_ratio/mean": 0.9998986721038818, "sampling/importance_sampling_ratio/min": 0.6610286831855774, "sampling/sampling_logp_difference/max": 0.4139580726623535, "sampling/sampling_logp_difference/mean": 0.013679726049304008, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 197.171875, "completions/mean_terminated_length": 197.171875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.48854365944862366, "epoch": 2.3970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.03660998181636321, "kl": 0.04842720180749893, "learning_rate": 1.1973919156272138e-07, "loss": 0.0005, "num_tokens": 61807042.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4974969625473022, "sampling/importance_sampling_ratio/mean": 1.0001397132873535, "sampling/importance_sampling_ratio/min": 0.6678784489631653, "sampling/sampling_logp_difference/max": 0.4037950038909912, "sampling/sampling_logp_difference/mean": 0.01671508140861988, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 234.046875, "completions/mean_terminated_length": 234.046875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.37770476937294006, "epoch": 2.3982843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7912677326600676, "kl": 0.025726042687892914, "learning_rate": 1.1927702081543278e-07, "loss": 0.0028, "num_tokens": 61840373.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4759455919265747, "sampling/importance_sampling_ratio/mean": 0.9998796582221985, "sampling/importance_sampling_ratio/min": 0.6622360944747925, "sampling/sampling_logp_difference/max": 0.4121330976486206, "sampling/sampling_logp_difference/mean": 0.01440503355115652, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 210.703125, "completions/mean_terminated_length": 210.703125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.44456803798675537, "epoch": 2.3995098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.012359104355091003, "kl": 0.02415592223405838, "learning_rate": 1.188156229116724e-07, "loss": 0.0002, "num_tokens": 61881938.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.430909514427185, "sampling/importance_sampling_ratio/mean": 0.9997051954269409, "sampling/importance_sampling_ratio/min": 0.6149290204048157, "sampling/sampling_logp_difference/max": 0.4862484931945801, "sampling/sampling_logp_difference/mean": 0.016197897493839264, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 197.890625, "completions/mean_terminated_length": 197.890625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3556378185749054, "epoch": 2.400735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7852415609725907, "kl": 0.035944290459156036, "learning_rate": 1.1835499878804861e-07, "loss": -0.0092, "num_tokens": 61912859.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.423384666442871, "sampling/importance_sampling_ratio/mean": 0.99934983253479, "sampling/importance_sampling_ratio/min": 0.6411640644073486, "sampling/sampling_logp_difference/max": 0.4444699287414551, "sampling/sampling_logp_difference/mean": 0.013170383870601654, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 164.453125, "completions/mean_terminated_length": 164.453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4567108750343323, "epoch": 2.4019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.027640789248631354, "kl": 0.045468587428331375, "learning_rate": 1.1789514937959965e-07, "loss": 0.0005, "num_tokens": 61938680.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.500498652458191, "sampling/importance_sampling_ratio/mean": 1.0003533363342285, "sampling/importance_sampling_ratio/min": 0.6302643418312073, "sampling/sampling_logp_difference/max": 0.46161603927612305, "sampling/sampling_logp_difference/mean": 0.01733510196208954, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 187.921875, "completions/mean_terminated_length": 187.921875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4292413890361786, "epoch": 2.403186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.026728092279935153, "kl": 0.039078257977962494, "learning_rate": 1.1743607561979013e-07, "loss": 0.0004, "num_tokens": 61970131.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4552148580551147, "sampling/importance_sampling_ratio/mean": 1.0002341270446777, "sampling/importance_sampling_ratio/min": 0.7300223112106323, "sampling/sampling_logp_difference/max": 0.3751535415649414, "sampling/sampling_logp_difference/mean": 0.01613743230700493, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 151.34375, "completions/mean_terminated_length": 151.34375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.37262940406799316, "epoch": 2.4044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.021750435024995473, "kl": 0.03221053257584572, "learning_rate": 1.1697777844051104e-07, "loss": 0.0003, "num_tokens": 61996681.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.349316954612732, "sampling/importance_sampling_ratio/mean": 0.9994165897369385, "sampling/importance_sampling_ratio/min": 0.61583012342453, "sampling/sampling_logp_difference/max": 0.4847841262817383, "sampling/sampling_logp_difference/mean": 0.015558486804366112, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.41379979252815247, "epoch": 2.405637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.905779788821325, "kl": 0.03606260195374489, "learning_rate": 1.1652025877207644e-07, "loss": 0.0066, "num_tokens": 62027289.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.319873571395874, "sampling/importance_sampling_ratio/mean": 1.000108003616333, "sampling/importance_sampling_ratio/min": 0.6361140608787537, "sampling/sampling_logp_difference/max": 0.4523773193359375, "sampling/sampling_logp_difference/mean": 0.014538638293743134, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 157.078125, "completions/mean_terminated_length": 157.078125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3212488293647766, "epoch": 2.406862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.017227515515825284, "kl": 0.02447613701224327, "learning_rate": 1.1606351754322247e-07, "loss": 0.0002, "num_tokens": 62052430.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5025979280471802, "sampling/importance_sampling_ratio/mean": 1.0008751153945923, "sampling/importance_sampling_ratio/min": 0.7386578917503357, "sampling/sampling_logp_difference/max": 0.4071955680847168, "sampling/sampling_logp_difference/mean": 0.013703729957342148, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 200.953125, "completions/mean_terminated_length": 200.953125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4697273373603821, "epoch": 2.4080882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.7855249261291434, "kl": 0.05025696009397507, "learning_rate": 1.156075556811048e-07, "loss": -0.0114, "num_tokens": 62084747.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4986478090286255, "sampling/importance_sampling_ratio/mean": 1.0002055168151855, "sampling/importance_sampling_ratio/min": 0.6056578159332275, "sampling/sampling_logp_difference/max": 0.5014400482177734, "sampling/sampling_logp_difference/mean": 0.017576631158590317, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 170.234375, "completions/mean_terminated_length": 170.234375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.41129642724990845, "epoch": 2.409313725490196, "frac_reward_zero_std": 0.5, "grad_norm": 1.4039440656830176, "kl": 0.034906573593616486, "learning_rate": 1.1515237411129697e-07, "loss": 0.0076, "num_tokens": 62115898.0, "reward": -0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4529551267623901, "sampling/importance_sampling_ratio/mean": 0.9999223947525024, "sampling/importance_sampling_ratio/min": 0.6937439441680908, "sampling/sampling_logp_difference/max": 0.3735995292663574, "sampling/sampling_logp_difference/mean": 0.015991121530532837, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 170.40625, "completions/mean_terminated_length": 170.40625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3708356022834778, "epoch": 2.4105392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8072169240429856, "kl": 0.05858634412288666, "learning_rate": 1.1469797375778901e-07, "loss": -0.016, "num_tokens": 62139604.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999583959579468, "sampling/importance_sampling_ratio/min": 0.6091251373291016, "sampling/sampling_logp_difference/max": 1.2542685270309448, "sampling/sampling_logp_difference/mean": 0.016308307647705078, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 221.3125, "completions/mean_terminated_length": 221.3125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.41176238656044006, "epoch": 2.411764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.6709462036295988, "kl": 0.042348649352788925, "learning_rate": 1.1424435554298473e-07, "loss": 0.0266, "num_tokens": 62175224.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.2956719398498535, "sampling/importance_sampling_ratio/mean": 0.9998412132263184, "sampling/importance_sampling_ratio/min": 0.6296241879463196, "sampling/sampling_logp_difference/max": 0.4626321792602539, "sampling/sampling_logp_difference/mean": 0.014581560157239437, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 237.1875, "completions/mean_terminated_length": 237.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.42337876558303833, "epoch": 2.4129901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7657342226561513, "kl": 0.0363505557179451, "learning_rate": 1.1379152038770029e-07, "loss": 0.0135, "num_tokens": 62211652.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.364220142364502, "sampling/importance_sampling_ratio/mean": 0.9999942183494568, "sampling/importance_sampling_ratio/min": 0.657956600189209, "sampling/sampling_logp_difference/max": 0.41861629486083984, "sampling/sampling_logp_difference/mean": 0.014614572748541832, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 221.296875, "completions/mean_terminated_length": 221.296875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.42439010739326477, "epoch": 2.4142156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.7412188463840695, "kl": 0.07795524597167969, "learning_rate": 1.1333946921116234e-07, "loss": 0.0112, "num_tokens": 62241911.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 0.9999186992645264, "sampling/importance_sampling_ratio/min": 0.6243884563446045, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.015674494206905365, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 191.453125, "completions/mean_terminated_length": 191.453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.45214176177978516, "epoch": 2.4154411764705883, "frac_reward_zero_std": 0.5, "grad_norm": 1.25250645981119, "kl": 0.0508386492729187, "learning_rate": 1.1288820293100637e-07, "loss": -0.0355, "num_tokens": 62273572.0, "reward": 0.375, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3050446510314941, "sampling/importance_sampling_ratio/mean": 1.0000816583633423, "sampling/importance_sampling_ratio/min": 0.6778684258460999, "sampling/sampling_logp_difference/max": 0.38880205154418945, "sampling/sampling_logp_difference/mean": 0.01609937846660614, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 178.53125, "completions/mean_terminated_length": 178.53125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3968961536884308, "epoch": 2.4166666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 1.1882204520855186, "kl": 0.026850782334804535, "learning_rate": 1.1243772246327415e-07, "loss": 0.0442, "num_tokens": 62304358.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.364865779876709, "sampling/importance_sampling_ratio/mean": 1.000064730644226, "sampling/importance_sampling_ratio/min": 0.6227121353149414, "sampling/sampling_logp_difference/max": 0.47367095947265625, "sampling/sampling_logp_difference/mean": 0.015055298805236816, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 227.171875, "completions/mean_terminated_length": 227.171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.45749881863594055, "epoch": 2.417892156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.117696947672644, "kl": 0.0721966102719307, "learning_rate": 1.1198802872241242e-07, "loss": 0.0116, "num_tokens": 62338257.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6658930778503418, "sampling/importance_sampling_ratio/mean": 1.0001027584075928, "sampling/importance_sampling_ratio/min": 0.4650120437145233, "sampling/sampling_logp_difference/max": 0.7656919956207275, "sampling/sampling_logp_difference/mean": 0.016596361994743347, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 127.234375, "completions/mean_terminated_length": 127.234375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.35489314794540405, "epoch": 2.4191176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.037266972411649196, "kl": 0.032278575003147125, "learning_rate": 1.1153912262127119e-07, "loss": 0.0003, "num_tokens": 62366480.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3228861093521118, "sampling/importance_sampling_ratio/mean": 0.9997625350952148, "sampling/importance_sampling_ratio/min": 0.6396545767784119, "sampling/sampling_logp_difference/max": 0.4468269348144531, "sampling/sampling_logp_difference/mean": 0.014400172978639603, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 179.515625, "completions/mean_terminated_length": 179.515625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3122207820415497, "epoch": 2.420343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.9409392365027711, "kl": 0.02874099276959896, "learning_rate": 1.1109100507110131e-07, "loss": -0.0084, "num_tokens": 62391761.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6055723428726196, "sampling/importance_sampling_ratio/mean": 1.0000977516174316, "sampling/importance_sampling_ratio/min": 0.7315836548805237, "sampling/sampling_logp_difference/max": 0.473480224609375, "sampling/sampling_logp_difference/mean": 0.01330866850912571, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.4175884425640106, "epoch": 2.4215686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.782914234652367, "kl": 0.05412402004003525, "learning_rate": 1.1064367698155303e-07, "loss": -0.0139, "num_tokens": 62426445.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5500508546829224, "sampling/importance_sampling_ratio/mean": 1.0003635883331299, "sampling/importance_sampling_ratio/min": 0.6406451463699341, "sampling/sampling_logp_difference/max": 0.445279598236084, "sampling/sampling_logp_difference/mean": 0.015453522093594074, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 180.59375, "completions/mean_terminated_length": 180.59375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3654729425907135, "epoch": 2.422794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.3398562328508408, "kl": 0.04450765997171402, "learning_rate": 1.1019713926067392e-07, "loss": -0.0163, "num_tokens": 62455875.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5971509218215942, "sampling/importance_sampling_ratio/mean": 1.0005977153778076, "sampling/importance_sampling_ratio/min": 0.617858350276947, "sampling/sampling_logp_difference/max": 0.48149609565734863, "sampling/sampling_logp_difference/mean": 0.01475649606436491, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 242.671875, "completions/mean_terminated_length": 242.671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.4201916456222534, "epoch": 2.424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7141283259830095, "kl": 0.03380756080150604, "learning_rate": 1.0975139281490747e-07, "loss": -0.0309, "num_tokens": 62490238.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7181060314178467, "sampling/importance_sampling_ratio/mean": 1.000054121017456, "sampling/importance_sampling_ratio/min": 0.631348192691803, "sampling/sampling_logp_difference/max": 0.5412225723266602, "sampling/sampling_logp_difference/mean": 0.015032166615128517, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 144.40625, "completions/mean_terminated_length": 144.40625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3454689383506775, "epoch": 2.4252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02095865564486938, "kl": 0.0612105168402195, "learning_rate": 1.093064385490906e-07, "loss": 0.0005, "num_tokens": 62513832.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6124627590179443, "sampling/importance_sampling_ratio/mean": 1.0005912780761719, "sampling/importance_sampling_ratio/min": 0.6428762674331665, "sampling/sampling_logp_difference/max": 0.47776269912719727, "sampling/sampling_logp_difference/mean": 0.014039501547813416, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 250.140625, "completions/mean_terminated_length": 250.140625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4830671548843384, "epoch": 2.426470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.8037283796562461, "kl": 0.07431580126285553, "learning_rate": 1.0886227736645215e-07, "loss": -0.0246, "num_tokens": 62552561.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001500844955444, "sampling/importance_sampling_ratio/min": 0.36815980076789856, "sampling/sampling_logp_difference/max": 0.9992382526397705, "sampling/sampling_logp_difference/mean": 0.016817739233374596, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 184.71875, "completions/mean_terminated_length": 184.71875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.44568321108818054, "epoch": 2.4276960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.4118583440996253, "kl": 0.06490681320428848, "learning_rate": 1.0841891016861155e-07, "loss": -0.0211, "num_tokens": 62584975.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5715563297271729, "sampling/importance_sampling_ratio/mean": 0.9995582699775696, "sampling/importance_sampling_ratio/min": 0.487101674079895, "sampling/sampling_logp_difference/max": 0.7192823886871338, "sampling/sampling_logp_difference/mean": 0.01702762395143509, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 218.234375, "completions/mean_terminated_length": 218.234375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.38632649183273315, "epoch": 2.428921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.017640183863070005, "kl": 0.0258384607732296, "learning_rate": 1.0797633785557581e-07, "loss": 0.0003, "num_tokens": 62620494.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4703099727630615, "sampling/importance_sampling_ratio/mean": 1.0003032684326172, "sampling/importance_sampling_ratio/min": 0.6110231280326843, "sampling/sampling_logp_difference/max": 0.49262046813964844, "sampling/sampling_logp_difference/mean": 0.014505396597087383, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 177.359375, "completions/mean_terminated_length": 177.359375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3513790965080261, "epoch": 2.4301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.018986955667485664, "kl": 0.02966098114848137, "learning_rate": 1.0753456132573885e-07, "loss": 0.0003, "num_tokens": 62652261.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4397295713424683, "sampling/importance_sampling_ratio/mean": 0.99994295835495, "sampling/importance_sampling_ratio/min": 0.6086769700050354, "sampling/sampling_logp_difference/max": 0.49646759033203125, "sampling/sampling_logp_difference/mean": 0.015134020708501339, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 198.703125, "completions/mean_terminated_length": 198.703125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3957688808441162, "epoch": 2.431372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.028757894653507546, "kl": 0.041085124015808105, "learning_rate": 1.0709358147587883e-07, "loss": 0.0005, "num_tokens": 62684642.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4350110292434692, "sampling/importance_sampling_ratio/mean": 1.0003424882888794, "sampling/importance_sampling_ratio/min": 0.6262711882591248, "sampling/sampling_logp_difference/max": 0.4679718017578125, "sampling/sampling_logp_difference/mean": 0.015146794728934765, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 240.71875, "completions/mean_terminated_length": 240.71875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.41249239444732666, "epoch": 2.4325980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7045451304959446, "kl": 0.027376476675271988, "learning_rate": 1.0665339920115718e-07, "loss": 0.0055, "num_tokens": 62717248.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4753891229629517, "sampling/importance_sampling_ratio/mean": 0.9999990463256836, "sampling/importance_sampling_ratio/min": 0.6014488935470581, "sampling/sampling_logp_difference/max": 0.5084137916564941, "sampling/sampling_logp_difference/mean": 0.015139452181756496, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 233.921875, "completions/mean_terminated_length": 233.921875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.5119118094444275, "epoch": 2.4338235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 1.0695061553494256, "kl": 0.044679585844278336, "learning_rate": 1.0621401539511587e-07, "loss": 0.0414, "num_tokens": 62755067.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4380325078964233, "sampling/importance_sampling_ratio/mean": 0.9998857975006104, "sampling/importance_sampling_ratio/min": 0.7264178395271301, "sampling/sampling_logp_difference/max": 0.36327576637268066, "sampling/sampling_logp_difference/mean": 0.016982797533273697, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 159.125, "completions/mean_terminated_length": 159.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.35807543992996216, "epoch": 2.435049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01767856583666848, "kl": 0.021159913390874863, "learning_rate": 1.0577543094967611e-07, "loss": 0.0002, "num_tokens": 62782771.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.425788402557373, "sampling/importance_sampling_ratio/mean": 0.9999739527702332, "sampling/importance_sampling_ratio/min": 0.6175535917282104, "sampling/sampling_logp_difference/max": 0.48198938369750977, "sampling/sampling_logp_difference/mean": 0.014918528497219086, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.36983782052993774, "epoch": 2.436274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.017714931439837617, "kl": 0.027224186807870865, "learning_rate": 1.053376467551368e-07, "loss": 0.0003, "num_tokens": 62812323.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3434499502182007, "sampling/importance_sampling_ratio/mean": 0.9995107054710388, "sampling/importance_sampling_ratio/min": 0.6668502688407898, "sampling/sampling_logp_difference/max": 0.40518975257873535, "sampling/sampling_logp_difference/mean": 0.014425843022763729, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 177.046875, "completions/mean_terminated_length": 177.046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3473891019821167, "epoch": 2.4375, "frac_reward_zero_std": 0.75, "grad_norm": 0.8341674998529726, "kl": 0.037632253021001816, "learning_rate": 1.0490066370017181e-07, "loss": 0.0272, "num_tokens": 62840534.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6463156938552856, "sampling/importance_sampling_ratio/mean": 0.9990817904472351, "sampling/importance_sampling_ratio/min": 0.6613803505897522, "sampling/sampling_logp_difference/max": 0.49853992462158203, "sampling/sampling_logp_difference/mean": 0.013913518749177456, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 203.328125, "completions/mean_terminated_length": 203.328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2740914821624756, "epoch": 2.438725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.01838227270240207, "kl": 0.023576756939291954, "learning_rate": 1.044644826718295e-07, "loss": 0.0002, "num_tokens": 62876235.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7334914207458496, "sampling/importance_sampling_ratio/mean": 1.000614881515503, "sampling/importance_sampling_ratio/min": 0.5954729914665222, "sampling/sampling_logp_difference/max": 0.5501375198364258, "sampling/sampling_logp_difference/mean": 0.01161906123161316, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 193.84375, "completions/mean_terminated_length": 193.84375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.36370471119880676, "epoch": 2.439950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9301842093189077, "kl": 0.026500212028622627, "learning_rate": 1.0402910455552916e-07, "loss": 0.003, "num_tokens": 62907009.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.441874384880066, "sampling/importance_sampling_ratio/mean": 0.999746561050415, "sampling/importance_sampling_ratio/min": 0.6290517449378967, "sampling/sampling_logp_difference/max": 0.46354174613952637, "sampling/sampling_logp_difference/mean": 0.015026746317744255, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.4158353805541992, "epoch": 2.4411764705882355, "frac_reward_zero_std": 0.5, "grad_norm": 1.1963761082120996, "kl": 0.05945902317762375, "learning_rate": 1.0359453023506121e-07, "loss": 0.0167, "num_tokens": 62934843.0, "reward": 0.0, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3790603876113892, "sampling/importance_sampling_ratio/mean": 0.9996902942657471, "sampling/importance_sampling_ratio/min": 0.7386338114738464, "sampling/sampling_logp_difference/max": 0.32140231132507324, "sampling/sampling_logp_difference/mean": 0.015041791833937168, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 178.296875, "completions/mean_terminated_length": 178.296875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.37770169973373413, "epoch": 2.4424019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7538945745100293, "kl": 0.026087483391165733, "learning_rate": 1.0316076059258389e-07, "loss": -0.0164, "num_tokens": 62963758.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4513964653015137, "sampling/importance_sampling_ratio/mean": 1.0002007484436035, "sampling/importance_sampling_ratio/min": 0.6932904124259949, "sampling/sampling_logp_difference/max": 0.3725261688232422, "sampling/sampling_logp_difference/mean": 0.014406517148017883, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 161.203125, "completions/mean_terminated_length": 161.203125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.34088271856307983, "epoch": 2.443627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.7765054386984444, "kl": 0.041137732565402985, "learning_rate": 1.0272779650862185e-07, "loss": -0.0014, "num_tokens": 62993723.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2942837476730347, "sampling/importance_sampling_ratio/mean": 0.9998880624771118, "sampling/importance_sampling_ratio/min": 0.6849300861358643, "sampling/sampling_logp_difference/max": 0.37843847274780273, "sampling/sampling_logp_difference/mean": 0.014076050370931625, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 184.90625, "completions/mean_terminated_length": 184.90625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.514701247215271, "epoch": 2.4448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9632573636638977, "kl": 0.039441876113414764, "learning_rate": 1.0229563886206516e-07, "loss": 0.0064, "num_tokens": 63024901.0, "reward": -0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": -0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3393256664276123, "sampling/importance_sampling_ratio/mean": 0.999917209148407, "sampling/importance_sampling_ratio/min": 0.6539126038551331, "sampling/sampling_logp_difference/max": 0.42478156089782715, "sampling/sampling_logp_difference/mean": 0.018140411004424095, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 185.96875, "completions/mean_terminated_length": 185.96875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3519733250141144, "epoch": 2.446078431372549, "frac_reward_zero_std": 0.75, "grad_norm": 0.963101684896797, "kl": 0.02421330101788044, "learning_rate": 1.0186428853016604e-07, "loss": 0.0044, "num_tokens": 63058179.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3225116729736328, "sampling/importance_sampling_ratio/mean": 1.0001866817474365, "sampling/importance_sampling_ratio/min": 0.6189565062522888, "sampling/sampling_logp_difference/max": 0.4797203540802002, "sampling/sampling_logp_difference/mean": 0.013560053892433643, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 173.796875, "completions/mean_terminated_length": 173.796875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.38833457231521606, "epoch": 2.4473039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.8695705363505262, "kl": 0.04325224086642265, "learning_rate": 1.0143374638853891e-07, "loss": -0.0217, "num_tokens": 63085302.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000678300857544, "sampling/importance_sampling_ratio/min": 0.47059664130210876, "sampling/sampling_logp_difference/max": 1.3454623222351074, "sampling/sampling_logp_difference/mean": 0.015392575412988663, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 183.9375, "completions/mean_terminated_length": 183.9375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.412697970867157, "epoch": 2.448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.7677827256530417, "kl": 0.057433705776929855, "learning_rate": 1.0100401331115638e-07, "loss": 0.012, "num_tokens": 63115026.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 1.00032377243042, "sampling/importance_sampling_ratio/min": 0.6919641494750977, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.015239194966852665, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 197.71875, "completions/mean_terminated_length": 197.71875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.45423510670661926, "epoch": 2.4497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.030658816526145053, "kl": 0.04963368922472, "learning_rate": 1.0057509017034977e-07, "loss": 0.0005, "num_tokens": 63145728.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.475376844406128, "sampling/importance_sampling_ratio/mean": 0.9997437000274658, "sampling/importance_sampling_ratio/min": 0.6212072968482971, "sampling/sampling_logp_difference/max": 0.4760904312133789, "sampling/sampling_logp_difference/mean": 0.01706961914896965, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 160.359375, "completions/mean_terminated_length": 160.359375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.34984079003334045, "epoch": 2.450980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.03902570883154942, "kl": 0.05480959266424179, "learning_rate": 1.001469778368057e-07, "loss": 0.0006, "num_tokens": 63171479.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002403259277344, "sampling/importance_sampling_ratio/min": 0.6890142560005188, "sampling/sampling_logp_difference/max": 1.0377509593963623, "sampling/sampling_logp_difference/mean": 0.014667082577943802, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 207.65625, "completions/mean_terminated_length": 207.65625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4067958891391754, "epoch": 2.452205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 1.150074053740817, "kl": 0.030066289007663727, "learning_rate": 9.971967717956531e-08, "loss": -0.0486, "num_tokens": 63212337.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4151383638381958, "sampling/importance_sampling_ratio/mean": 1.0000475645065308, "sampling/importance_sampling_ratio/min": 0.6594096422195435, "sampling/sampling_logp_difference/max": 0.41641032695770264, "sampling/sampling_logp_difference/mean": 0.015238635241985321, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 207.609375, "completions/mean_terminated_length": 207.609375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3727913200855255, "epoch": 2.4534313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.016721451945647513, "kl": 0.028154637664556503, "learning_rate": 9.929318906602174e-08, "loss": 0.0003, "num_tokens": 63241848.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3162541389465332, "sampling/importance_sampling_ratio/mean": 1.0001139640808105, "sampling/importance_sampling_ratio/min": 0.6552072167396545, "sampling/sampling_logp_difference/max": 0.42280375957489014, "sampling/sampling_logp_difference/mean": 0.013950793072581291, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 226.6875, "completions/mean_terminated_length": 226.6875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3771401047706604, "epoch": 2.454656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.014699440635072126, "kl": 0.02426079846918583, "learning_rate": 9.886751436191871e-08, "loss": 0.0002, "num_tokens": 63276148.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4930033683776855, "sampling/importance_sampling_ratio/mean": 1.000488519668579, "sampling/importance_sampling_ratio/min": 0.6954967975616455, "sampling/sampling_logp_difference/max": 0.400789737701416, "sampling/sampling_logp_difference/mean": 0.01370636560022831, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 179.84375, "completions/mean_terminated_length": 179.84375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3870733380317688, "epoch": 2.4558823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.9187841935687155, "kl": 0.05936279892921448, "learning_rate": 9.844265393134926e-08, "loss": 0.0108, "num_tokens": 63307226.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.797240972518921, "sampling/importance_sampling_ratio/mean": 1.0002281665802002, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.5862526893615723, "sampling/sampling_logp_difference/mean": 0.01534139271825552, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 175.421875, "completions/mean_terminated_length": 175.421875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.400735080242157, "epoch": 2.457107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.8288882489790806, "kl": 0.033195994794368744, "learning_rate": 9.801860863675266e-08, "loss": -0.0031, "num_tokens": 63338357.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5534627437591553, "sampling/importance_sampling_ratio/mean": 0.9995038509368896, "sampling/importance_sampling_ratio/min": 0.6670553684234619, "sampling/sampling_logp_difference/max": 0.44048643112182617, "sampling/sampling_logp_difference/mean": 0.01500217616558075, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 206.5, "completions/mean_terminated_length": 206.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.43173086643218994, "epoch": 2.4583333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.14602123466493, "kl": 0.05417206510901451, "learning_rate": 9.759537933891421e-08, "loss": -0.0255, "num_tokens": 63367765.0, "reward": 0.59375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4411839246749878, "sampling/importance_sampling_ratio/mean": 0.9996616244316101, "sampling/importance_sampling_ratio/min": 0.6482206583023071, "sampling/sampling_logp_difference/max": 0.43352413177490234, "sampling/sampling_logp_difference/mean": 0.016676776111125946, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 180.078125, "completions/mean_terminated_length": 180.078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.45597320795059204, "epoch": 2.4595588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.048256222930180066, "kl": 0.11169981211423874, "learning_rate": 9.71729668969628e-08, "loss": 0.0011, "num_tokens": 63397258.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998218417167664, "sampling/importance_sampling_ratio/min": 0.6490694880485535, "sampling/sampling_logp_difference/max": 0.7834055423736572, "sampling/sampling_logp_difference/mean": 0.017066188156604767, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4196290969848633, "epoch": 2.4607843137254903, "frac_reward_zero_std": 0.5, "grad_norm": 1.2622915347290284, "kl": 0.05284261703491211, "learning_rate": 9.67513721683687e-08, "loss": 0.0063, "num_tokens": 63423918.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5551180839538574, "sampling/importance_sampling_ratio/mean": 0.9994953274726868, "sampling/importance_sampling_ratio/min": 0.6151927709579468, "sampling/sampling_logp_difference/max": 0.48581957817077637, "sampling/sampling_logp_difference/mean": 0.015804724767804146, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 184.03125, "completions/mean_terminated_length": 184.03125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3441496193408966, "epoch": 2.4620098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.018857326764506976, "kl": 0.02470124140381813, "learning_rate": 9.633059600894256e-08, "loss": 0.0002, "num_tokens": 63460752.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007341146469116, "sampling/importance_sampling_ratio/mean": 0.999882698059082, "sampling/importance_sampling_ratio/min": 0.6460320949554443, "sampling/sampling_logp_difference/max": 0.4704623222351074, "sampling/sampling_logp_difference/mean": 0.013898389413952827, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 175.40625, "completions/mean_terminated_length": 175.40625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.37059569358825684, "epoch": 2.463235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.020059675314963523, "kl": 0.02793136239051819, "learning_rate": 9.59106392728331e-08, "loss": 0.0003, "num_tokens": 63491914.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.878018856048584, "sampling/importance_sampling_ratio/mean": 0.9999091625213623, "sampling/importance_sampling_ratio/min": 0.730019211769104, "sampling/sampling_logp_difference/max": 0.630217432975769, "sampling/sampling_logp_difference/mean": 0.014268193393945694, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 191.046875, "completions/mean_terminated_length": 191.046875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3454241156578064, "epoch": 2.4644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.01668403844722609, "kl": 0.02659863792359829, "learning_rate": 9.549150281252632e-08, "loss": 0.0003, "num_tokens": 63526301.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4916746616363525, "sampling/importance_sampling_ratio/mean": 0.9998441934585571, "sampling/importance_sampling_ratio/min": 0.4908023476600647, "sampling/sampling_logp_difference/max": 0.7117137908935547, "sampling/sampling_logp_difference/mean": 0.014071143232285976, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 224.015625, "completions/mean_terminated_length": 224.015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.45142269134521484, "epoch": 2.465686274509804, "frac_reward_zero_std": 0.5, "grad_norm": 0.9943780380033651, "kl": 0.053271468728780746, "learning_rate": 9.507318747884241e-08, "loss": -0.0399, "num_tokens": 63560734.0, "reward": 0.125, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.9499462842941284, "sampling/importance_sampling_ratio/mean": 0.9997901320457458, "sampling/importance_sampling_ratio/min": 0.6057900190353394, "sampling/sampling_logp_difference/max": 0.6678018569946289, "sampling/sampling_logp_difference/mean": 0.015925079584121704, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 204.546875, "completions/mean_terminated_length": 204.546875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.38158512115478516, "epoch": 2.4669117647058822, "frac_reward_zero_std": 0.5, "grad_norm": 1.199859856240969, "kl": 0.049633897840976715, "learning_rate": 9.465569412093488e-08, "loss": 0.0155, "num_tokens": 63589393.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4727635383605957, "sampling/importance_sampling_ratio/mean": 0.9995374083518982, "sampling/importance_sampling_ratio/min": 0.624771773815155, "sampling/sampling_logp_difference/max": 0.4703688621520996, "sampling/sampling_logp_difference/mean": 0.014872990548610687, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 233.03125, "completions/mean_terminated_length": 233.03125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3444235324859619, "epoch": 2.468137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.6450744790665558, "kl": 0.026743315160274506, "learning_rate": 9.423902358628916e-08, "loss": 0.0071, "num_tokens": 63629219.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5467276573181152, "sampling/importance_sampling_ratio/mean": 1.0003172159194946, "sampling/importance_sampling_ratio/min": 0.6140531897544861, "sampling/sampling_logp_difference/max": 0.4876737594604492, "sampling/sampling_logp_difference/mean": 0.013777516782283783, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 201.46875, "completions/mean_terminated_length": 201.46875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3421174883842468, "epoch": 2.469362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.8465805260813877, "kl": 0.03840591758489609, "learning_rate": 9.382317672071966e-08, "loss": 0.0088, "num_tokens": 63655601.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5356110334396362, "sampling/importance_sampling_ratio/mean": 0.9997233152389526, "sampling/importance_sampling_ratio/min": 0.6057863831520081, "sampling/sampling_logp_difference/max": 0.501227855682373, "sampling/sampling_logp_difference/mean": 0.01357186958193779, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.42109107971191406, "epoch": 2.4705882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.0802098526469588, "kl": 0.035182803869247437, "learning_rate": 9.340815436836963e-08, "loss": -0.0328, "num_tokens": 63688569.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6001441478729248, "sampling/importance_sampling_ratio/mean": 0.9997341632843018, "sampling/importance_sampling_ratio/min": 0.4194274842739105, "sampling/sampling_logp_difference/max": 0.8688646554946899, "sampling/sampling_logp_difference/mean": 0.016411174088716507, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 258.828125, "completions/mean_terminated_length": 258.828125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3673095703125, "epoch": 2.471813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.7011740149640728, "kl": 0.024489955976605415, "learning_rate": 9.299395737170757e-08, "loss": -0.0232, "num_tokens": 63722350.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4139859676361084, "sampling/importance_sampling_ratio/mean": 0.9997345805168152, "sampling/importance_sampling_ratio/min": 0.6033496260643005, "sampling/sampling_logp_difference/max": 0.5052584409713745, "sampling/sampling_logp_difference/mean": 0.013786962255835533, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.34036415815353394, "epoch": 2.4730392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.023419569378104955, "kl": 0.025814056396484375, "learning_rate": 9.258058657152761e-08, "loss": 0.0003, "num_tokens": 63749866.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5875335931777954, "sampling/importance_sampling_ratio/mean": 1.0000228881835938, "sampling/importance_sampling_ratio/min": 0.641847550868988, "sampling/sampling_logp_difference/max": 0.46218156814575195, "sampling/sampling_logp_difference/mean": 0.014968582428991795, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 207.390625, "completions/mean_terminated_length": 207.390625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37217777967453003, "epoch": 2.474264705882353, "frac_reward_zero_std": 0.5, "grad_norm": 1.3054132438275574, "kl": 0.035565003752708435, "learning_rate": 9.216804280694612e-08, "loss": -0.0938, "num_tokens": 63779571.0, "reward": -0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6907960176467896, "sampling/importance_sampling_ratio/mean": 1.000901460647583, "sampling/importance_sampling_ratio/min": 0.6325021386146545, "sampling/sampling_logp_difference/max": 0.5251994132995605, "sampling/sampling_logp_difference/mean": 0.015001806430518627, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3852364122867584, "epoch": 2.4754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.016064194865100398, "kl": 0.02930980920791626, "learning_rate": 9.175632691540064e-08, "loss": 0.0003, "num_tokens": 63813435.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5754631757736206, "sampling/importance_sampling_ratio/mean": 1.0003750324249268, "sampling/importance_sampling_ratio/min": 0.6259395480155945, "sampling/sampling_logp_difference/max": 0.46850156784057617, "sampling/sampling_logp_difference/mean": 0.014602059498429298, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3446516692638397, "epoch": 2.4767156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 1.0883625505459922, "kl": 0.04013896360993385, "learning_rate": 9.134543973264868e-08, "loss": 0.0209, "num_tokens": 63833261.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5031681060791016, "sampling/importance_sampling_ratio/mean": 1.0001024007797241, "sampling/importance_sampling_ratio/min": 0.6957728862762451, "sampling/sampling_logp_difference/max": 0.40757501125335693, "sampling/sampling_logp_difference/mean": 0.015270713716745377, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 224.078125, "completions/mean_terminated_length": 224.078125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.4325482249259949, "epoch": 2.4779411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.7694334140558714, "kl": 0.06226831674575806, "learning_rate": 9.093538209276486e-08, "loss": -0.0138, "num_tokens": 63863330.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3806134462356567, "sampling/importance_sampling_ratio/mean": 1.0003423690795898, "sampling/importance_sampling_ratio/min": 0.6546946167945862, "sampling/sampling_logp_difference/max": 0.423586368560791, "sampling/sampling_logp_difference/mean": 0.015872148796916008, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 237.34375, "completions/mean_terminated_length": 237.34375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3297128975391388, "epoch": 2.4791666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6557936677120535, "kl": 0.0265534445643425, "learning_rate": 9.052615482814069e-08, "loss": 0.0173, "num_tokens": 63901064.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2983318567276, "sampling/importance_sampling_ratio/mean": 0.9998232126235962, "sampling/importance_sampling_ratio/min": 0.6395631432533264, "sampling/sampling_logp_difference/max": 0.44696998596191406, "sampling/sampling_logp_difference/mean": 0.011355316266417503, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 200.953125, "completions/mean_terminated_length": 200.953125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4074874520301819, "epoch": 2.480392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.0673112340216897, "kl": 0.0918905958533287, "learning_rate": 9.011775876948096e-08, "loss": 0.0229, "num_tokens": 63928997.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4352679252624512, "sampling/importance_sampling_ratio/mean": 1.0004394054412842, "sampling/importance_sampling_ratio/min": 0.6043077707290649, "sampling/sampling_logp_difference/max": 0.5036716461181641, "sampling/sampling_logp_difference/mean": 0.015174496918916702, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 181.1875, "completions/mean_terminated_length": 181.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.39710932970046997, "epoch": 2.4816176470588234, "frac_reward_zero_std": 0.25, "grad_norm": 1.6652212529846395, "kl": 0.05999930948019028, "learning_rate": 8.971019474580427e-08, "loss": -0.0612, "num_tokens": 63954081.0, "reward": 0.1875, "reward_std": 0.6525881886482239, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.2878774404525757, "sampling/importance_sampling_ratio/mean": 0.9994023442268372, "sampling/importance_sampling_ratio/min": 0.6396613121032715, "sampling/sampling_logp_difference/max": 0.44681644439697266, "sampling/sampling_logp_difference/mean": 0.016382107511162758, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 179.9375, "completions/mean_terminated_length": 179.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4748602509498596, "epoch": 2.482843137254902, "frac_reward_zero_std": 0.5, "grad_norm": 1.4228785543022433, "kl": 0.0473778061568737, "learning_rate": 8.930346358443953e-08, "loss": -0.0277, "num_tokens": 63980269.0, "reward": 0.21875, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.320014238357544, "sampling/importance_sampling_ratio/mean": 0.9999608993530273, "sampling/importance_sampling_ratio/min": 0.6395493745803833, "sampling/sampling_logp_difference/max": 0.4469914436340332, "sampling/sampling_logp_difference/mean": 0.01677963137626648, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 195.46875, "completions/mean_terminated_length": 195.46875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.45336729288101196, "epoch": 2.4840686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.05164388911331413, "kl": 0.0730249434709549, "learning_rate": 8.889756611102539e-08, "loss": 0.0008, "num_tokens": 64007307.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6210100650787354, "sampling/importance_sampling_ratio/mean": 1.0003575086593628, "sampling/importance_sampling_ratio/min": 0.6512575149536133, "sampling/sampling_logp_difference/max": 0.4830493927001953, "sampling/sampling_logp_difference/mean": 0.016209498047828674, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 192.90625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3370332717895508, "epoch": 2.485294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.014438929385510302, "kl": 0.027498988434672356, "learning_rate": 8.84925031495079e-08, "loss": 0.0003, "num_tokens": 64036853.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4217779636383057, "sampling/importance_sampling_ratio/mean": 1.0000685453414917, "sampling/importance_sampling_ratio/min": 0.5487043857574463, "sampling/sampling_logp_difference/max": 0.6001954078674316, "sampling/sampling_logp_difference/mean": 0.013127630576491356, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 171.5625, "completions/mean_terminated_length": 171.5625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4325448274612427, "epoch": 2.486519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.2946288954815135, "kl": 0.043557170778512955, "learning_rate": 8.808827552213916e-08, "loss": -0.0239, "num_tokens": 64062217.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8101012706756592, "sampling/importance_sampling_ratio/mean": 1.0000345706939697, "sampling/importance_sampling_ratio/min": 0.6701188087463379, "sampling/sampling_logp_difference/max": 0.5933828353881836, "sampling/sampling_logp_difference/mean": 0.017916850745677948, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 153.078125, "completions/mean_terminated_length": 153.078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.39488697052001953, "epoch": 2.4877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.02236514120023166, "kl": 0.0324961319565773, "learning_rate": 8.768488404947593e-08, "loss": 0.0003, "num_tokens": 64088398.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4411685466766357, "sampling/importance_sampling_ratio/mean": 0.9999464750289917, "sampling/importance_sampling_ratio/min": 0.704010546207428, "sampling/sampling_logp_difference/max": 0.36545419692993164, "sampling/sampling_logp_difference/mean": 0.016623370349407196, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 212.53125, "completions/mean_terminated_length": 212.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3667662739753723, "epoch": 2.488970588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020995587781656774, "kl": 0.04251813888549805, "learning_rate": 8.728232955037696e-08, "loss": 0.0004, "num_tokens": 64119408.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5746550559997559, "sampling/importance_sampling_ratio/mean": 0.9998937845230103, "sampling/importance_sampling_ratio/min": 0.6589528918266296, "sampling/sampling_logp_difference/max": 0.45403623580932617, "sampling/sampling_logp_difference/mean": 0.014021791517734528, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 196.828125, "completions/mean_terminated_length": 196.828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.33141422271728516, "epoch": 2.4901960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.016957718744736568, "kl": 0.024404536932706833, "learning_rate": 8.688061284200265e-08, "loss": 0.0002, "num_tokens": 64150885.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3832037448883057, "sampling/importance_sampling_ratio/mean": 0.9999232888221741, "sampling/importance_sampling_ratio/min": 0.7788761854171753, "sampling/sampling_logp_difference/max": 0.3244023323059082, "sampling/sampling_logp_difference/mean": 0.013108965009450912, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 239.671875, "completions/mean_terminated_length": 239.671875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.47033601999282837, "epoch": 2.491421568627451, "frac_reward_zero_std": 0.75, "grad_norm": 0.7357835453529568, "kl": 0.07376972585916519, "learning_rate": 8.647973473981224e-08, "loss": -0.0187, "num_tokens": 64185872.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.400166630744934, "sampling/importance_sampling_ratio/mean": 0.9998288154602051, "sampling/importance_sampling_ratio/min": 0.6527488827705383, "sampling/sampling_logp_difference/max": 0.4265627861022949, "sampling/sampling_logp_difference/mean": 0.015554027631878853, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 181.96875, "completions/mean_terminated_length": 181.96875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.40833908319473267, "epoch": 2.4926470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.027525734351921034, "kl": 0.035365961492061615, "learning_rate": 8.607969605756315e-08, "loss": 0.0004, "num_tokens": 64215886.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4768397808074951, "sampling/importance_sampling_ratio/mean": 1.0000439882278442, "sampling/importance_sampling_ratio/min": 0.6260562539100647, "sampling/sampling_logp_difference/max": 0.46831512451171875, "sampling/sampling_logp_difference/mean": 0.015282193198800087, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 211.578125, "completions/mean_terminated_length": 211.578125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4208946228027344, "epoch": 2.493872549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.017929340389628883, "kl": 0.02399945817887783, "learning_rate": 8.568049760730838e-08, "loss": 0.0002, "num_tokens": 64250579.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.515325903892517, "sampling/importance_sampling_ratio/mean": 0.9996589422225952, "sampling/importance_sampling_ratio/min": 0.6436850428581238, "sampling/sampling_logp_difference/max": 0.44054579734802246, "sampling/sampling_logp_difference/mean": 0.016491299495100975, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 207.796875, "completions/mean_terminated_length": 207.796875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.466720312833786, "epoch": 2.4950980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 1.1012710581858423, "kl": 0.05975695699453354, "learning_rate": 8.52821401993955e-08, "loss": 0.0114, "num_tokens": 64282758.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5589814186096191, "sampling/importance_sampling_ratio/mean": 1.0004221200942993, "sampling/importance_sampling_ratio/min": 0.37357693910598755, "sampling/sampling_logp_difference/max": 0.9846312999725342, "sampling/sampling_logp_difference/mean": 0.016525056213140488, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 177.65625, "completions/mean_terminated_length": 177.65625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.36008650064468384, "epoch": 2.4963235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.01835023940067309, "kl": 0.02421746775507927, "learning_rate": 8.488462464246493e-08, "loss": 0.0002, "num_tokens": 64314416.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5008811950683594, "sampling/importance_sampling_ratio/mean": 0.999953031539917, "sampling/importance_sampling_ratio/min": 0.6886656284332275, "sampling/sampling_logp_difference/max": 0.4060523509979248, "sampling/sampling_logp_difference/mean": 0.015186588279902935, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 155.78125, "completions/mean_terminated_length": 155.78125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3260124921798706, "epoch": 2.497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.031217136219159305, "kl": 0.03108660690486431, "learning_rate": 8.448795174344803e-08, "loss": 0.0003, "num_tokens": 64341890.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4375299215316772, "sampling/importance_sampling_ratio/mean": 1.0002895593643188, "sampling/importance_sampling_ratio/min": 0.6622427105903625, "sampling/sampling_logp_difference/max": 0.4121232032775879, "sampling/sampling_logp_difference/mean": 0.01380416564643383, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3441891372203827, "epoch": 2.498774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.020790095945366873, "kl": 0.03097061812877655, "learning_rate": 8.409212230756563e-08, "loss": 0.0003, "num_tokens": 64367470.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4096341133117676, "sampling/importance_sampling_ratio/mean": 1.0000736713409424, "sampling/importance_sampling_ratio/min": 0.5910730957984924, "sampling/sampling_logp_difference/max": 0.5258156061172485, "sampling/sampling_logp_difference/mean": 0.014658539555966854, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 219.296875, "completions/mean_terminated_length": 219.296875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3873305916786194, "epoch": 2.5, "frac_reward_zero_std": 0.75, "grad_norm": 0.8664369968312178, "kl": 0.035865385085344315, "learning_rate": 8.369713713832622e-08, "loss": 0.0096, "num_tokens": 64400961.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4710254669189453, "sampling/importance_sampling_ratio/mean": 1.0000736713409424, "sampling/importance_sampling_ratio/min": 0.6631979942321777, "sampling/sampling_logp_difference/max": 0.41068172454833984, "sampling/sampling_logp_difference/mean": 0.013265905901789665, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 197.734375, "completions/mean_terminated_length": 197.734375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4702549874782562, "epoch": 2.501225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.9499653475974037, "kl": 0.07387121766805649, "learning_rate": 8.330299703752497e-08, "loss": 0.0165, "num_tokens": 64434496.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5756784677505493, "sampling/importance_sampling_ratio/mean": 1.0000019073486328, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4679851531982422, "sampling/sampling_logp_difference/mean": 0.016622673720121384, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 181.453125, "completions/mean_terminated_length": 181.453125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.41390663385391235, "epoch": 2.502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8451944335942622, "kl": 0.05394618213176727, "learning_rate": 8.290970280524124e-08, "loss": -0.0286, "num_tokens": 64460941.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6247601509094238, "sampling/importance_sampling_ratio/mean": 0.9998599886894226, "sampling/importance_sampling_ratio/min": 0.5128727555274963, "sampling/sampling_logp_difference/max": 0.6677275896072388, "sampling/sampling_logp_difference/mean": 0.016559142619371414, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 148.09375, "completions/mean_terminated_length": 148.09375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3281470835208893, "epoch": 2.5036764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.018593121964325745, "kl": 0.02637426368892193, "learning_rate": 8.251725523983722e-08, "loss": 0.0003, "num_tokens": 64485955.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6862460374832153, "sampling/importance_sampling_ratio/mean": 1.000045657157898, "sampling/importance_sampling_ratio/min": 0.6827670335769653, "sampling/sampling_logp_difference/max": 0.5225048065185547, "sampling/sampling_logp_difference/mean": 0.015443524345755577, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 223.96875, "completions/mean_terminated_length": 223.96875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4159507155418396, "epoch": 2.5049019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.3564581012158, "kl": 0.048696912825107574, "learning_rate": 8.212565513795683e-08, "loss": -0.0395, "num_tokens": 64520513.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6581302881240845, "sampling/importance_sampling_ratio/mean": 1.0005327463150024, "sampling/importance_sampling_ratio/min": 0.6661763787269592, "sampling/sampling_logp_difference/max": 0.5056905746459961, "sampling/sampling_logp_difference/mean": 0.015493962913751602, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 197.84375, "completions/mean_terminated_length": 197.84375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3104378879070282, "epoch": 2.506127450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.013666259204105519, "kl": 0.019603494554758072, "learning_rate": 8.173490329452343e-08, "loss": 0.0002, "num_tokens": 64551111.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4720368385314941, "sampling/importance_sampling_ratio/mean": 0.9996908903121948, "sampling/importance_sampling_ratio/min": 0.7729811668395996, "sampling/sampling_logp_difference/max": 0.38664698600769043, "sampling/sampling_logp_difference/mean": 0.013405261561274529, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 219.296875, "completions/mean_terminated_length": 219.296875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3943178057670593, "epoch": 2.5073529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01440045804750558, "kl": 0.023328255861997604, "learning_rate": 8.13450005027384e-08, "loss": 0.0002, "num_tokens": 64582458.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.629729151725769, "sampling/importance_sampling_ratio/mean": 0.9995909929275513, "sampling/importance_sampling_ratio/min": 0.6272247433662415, "sampling/sampling_logp_difference/max": 0.48841381072998047, "sampling/sampling_logp_difference/mean": 0.0153773482888937, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 212.671875, "completions/mean_terminated_length": 212.671875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.36984264850616455, "epoch": 2.508578431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.1698487761148253, "kl": 0.03360015153884888, "learning_rate": 8.09559475540797e-08, "loss": 0.0172, "num_tokens": 64614261.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.7476072311401367, "sampling/importance_sampling_ratio/mean": 0.9999226927757263, "sampling/importance_sampling_ratio/min": 0.48961153626441956, "sampling/sampling_logp_difference/max": 0.7141430377960205, "sampling/sampling_logp_difference/mean": 0.015418700873851776, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 182.46875, "completions/mean_terminated_length": 182.46875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32081127166748047, "epoch": 2.5098039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.017766140029201693, "kl": 0.023535825312137604, "learning_rate": 8.056774523830029e-08, "loss": 0.0002, "num_tokens": 64639251.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4059449434280396, "sampling/importance_sampling_ratio/mean": 1.0003461837768555, "sampling/importance_sampling_ratio/min": 0.6841716766357422, "sampling/sampling_logp_difference/max": 0.3795464038848877, "sampling/sampling_logp_difference/mean": 0.013239240273833275, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 195.65625, "completions/mean_terminated_length": 195.65625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3986157178878784, "epoch": 2.5110294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.015950740997979006, "kl": 0.02680913172662258, "learning_rate": 8.018039434342627e-08, "loss": 0.0003, "num_tokens": 64669229.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7550904750823975, "sampling/importance_sampling_ratio/mean": 0.9998959898948669, "sampling/importance_sampling_ratio/min": 0.6712509393692017, "sampling/sampling_logp_difference/max": 0.5625203847885132, "sampling/sampling_logp_difference/mean": 0.01490098051726818, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 207.90625, "completions/mean_terminated_length": 207.90625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.39417797327041626, "epoch": 2.5122549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.9804432327070892, "kl": 0.03181571513414383, "learning_rate": 7.979389565575522e-08, "loss": -0.0376, "num_tokens": 64704519.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6269478797912598, "sampling/importance_sampling_ratio/mean": 0.9998840689659119, "sampling/importance_sampling_ratio/min": 0.6305050849914551, "sampling/sampling_logp_difference/max": 0.4867057800292969, "sampling/sampling_logp_difference/mean": 0.013905135914683342, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 201.8125, "completions/mean_terminated_length": 201.8125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.39764657616615295, "epoch": 2.513480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.009551746613855, "kl": 0.06586883962154388, "learning_rate": 7.940824995985528e-08, "loss": 0.0059, "num_tokens": 64733723.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5073206424713135, "sampling/importance_sampling_ratio/mean": 1.000825047492981, "sampling/importance_sampling_ratio/min": 0.6488670706748962, "sampling/sampling_logp_difference/max": 0.43252742290496826, "sampling/sampling_logp_difference/mean": 0.01538984663784504, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 199.421875, "completions/mean_terminated_length": 199.421875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3452928066253662, "epoch": 2.514705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.01763087271025228, "kl": 0.024976499378681183, "learning_rate": 7.902345803856264e-08, "loss": 0.0002, "num_tokens": 64766246.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.313920259475708, "sampling/importance_sampling_ratio/mean": 1.0007414817810059, "sampling/importance_sampling_ratio/min": 0.6838679313659668, "sampling/sampling_logp_difference/max": 0.37999045848846436, "sampling/sampling_logp_difference/mean": 0.01332725677639246, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 205.84375, "completions/mean_terminated_length": 205.84375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.28712159395217896, "epoch": 2.5159313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.012941324844786961, "kl": 0.014969379641115665, "learning_rate": 7.863952067298041e-08, "loss": 0.0001, "num_tokens": 64797692.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5468515157699585, "sampling/importance_sampling_ratio/mean": 1.0001466274261475, "sampling/importance_sampling_ratio/min": 0.5197044610977173, "sampling/sampling_logp_difference/max": 0.6544950008392334, "sampling/sampling_logp_difference/mean": 0.012371763586997986, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 208.4375, "completions/mean_terminated_length": 208.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.5194035768508911, "epoch": 2.517156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.8939359137696472, "kl": 0.06294498592615128, "learning_rate": 7.825643864247733e-08, "loss": -0.0189, "num_tokens": 64834056.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.326941728591919, "sampling/importance_sampling_ratio/mean": 1.0005096197128296, "sampling/importance_sampling_ratio/min": 0.7042613625526428, "sampling/sampling_logp_difference/max": 0.35060572624206543, "sampling/sampling_logp_difference/mean": 0.017616376280784607, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 181.828125, "completions/mean_terminated_length": 181.828125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.35135945677757263, "epoch": 2.5183823529411766, "frac_reward_zero_std": 0.5, "grad_norm": 1.2998194419705107, "kl": 0.037417348474264145, "learning_rate": 7.787421272468547e-08, "loss": 0.0299, "num_tokens": 64865501.0, "reward": 0.84375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4126293659210205, "sampling/importance_sampling_ratio/mean": 0.9993284344673157, "sampling/importance_sampling_ratio/min": 0.6157500743865967, "sampling/sampling_logp_difference/max": 0.48491406440734863, "sampling/sampling_logp_difference/mean": 0.014803480356931686, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 185.59375, "completions/mean_terminated_length": 185.59375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.4694395661354065, "epoch": 2.519607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.944606688946615, "kl": 0.045927416533231735, "learning_rate": 7.749284369549952e-08, "loss": -0.0029, "num_tokens": 64892723.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3966535329818726, "sampling/importance_sampling_ratio/mean": 0.9997533559799194, "sampling/importance_sampling_ratio/min": 0.6684202551841736, "sampling/sampling_logp_difference/max": 0.40283823013305664, "sampling/sampling_logp_difference/mean": 0.01650446094572544, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 181.265625, "completions/mean_terminated_length": 181.265625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.48010629415512085, "epoch": 2.5208333333333335, "frac_reward_zero_std": 0.5, "grad_norm": 1.652227345604709, "kl": 0.05787282437086105, "learning_rate": 7.711233232907399e-08, "loss": 0.0598, "num_tokens": 64922260.0, "reward": 0.59375, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000523567199707, "sampling/importance_sampling_ratio/min": 0.6213348507881165, "sampling/sampling_logp_difference/max": 0.996757984161377, "sampling/sampling_logp_difference/mean": 0.017763352021574974, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 209.0625, "completions/mean_terminated_length": 209.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4046059250831604, "epoch": 2.5220588235294117, "frac_reward_zero_std": 0.5, "grad_norm": 1.2512937563054876, "kl": 0.03622438386082649, "learning_rate": 7.673267939782324e-08, "loss": 0.0188, "num_tokens": 64957576.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6640219688415527, "sampling/importance_sampling_ratio/mean": 1.0006648302078247, "sampling/importance_sampling_ratio/min": 0.690620481967926, "sampling/sampling_logp_difference/max": 0.50923752784729, "sampling/sampling_logp_difference/mean": 0.014391078613698483, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.33976441621780396, "epoch": 2.5232843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.013465433339745361, "kl": 0.020324591547250748, "learning_rate": 7.63538856724184e-08, "loss": 0.0002, "num_tokens": 64992968.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7948695421218872, "sampling/importance_sampling_ratio/mean": 1.0009374618530273, "sampling/importance_sampling_ratio/min": 0.5868720412254333, "sampling/sampling_logp_difference/max": 0.5849323272705078, "sampling/sampling_logp_difference/mean": 0.014407450333237648, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.40805691480636597, "epoch": 2.5245098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.8261104427261089, "kl": 0.028869224712252617, "learning_rate": 7.597595192178702e-08, "loss": -0.0153, "num_tokens": 65022128.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5972583293914795, "sampling/importance_sampling_ratio/mean": 0.9998906254768372, "sampling/importance_sampling_ratio/min": 0.6615021228790283, "sampling/sampling_logp_difference/max": 0.4682886600494385, "sampling/sampling_logp_difference/mean": 0.015856822952628136, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 181.359375, "completions/mean_terminated_length": 181.359375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.37535834312438965, "epoch": 2.525735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.0106305715428294, "kl": 0.054244644939899445, "learning_rate": 7.559887891311046e-08, "loss": 0.0058, "num_tokens": 65049015.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.562191367149353, "sampling/importance_sampling_ratio/mean": 0.9995304346084595, "sampling/importance_sampling_ratio/min": 0.6192928552627563, "sampling/sampling_logp_difference/max": 0.47917699813842773, "sampling/sampling_logp_difference/mean": 0.01474264170974493, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 190.734375, "completions/mean_terminated_length": 190.734375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4425475597381592, "epoch": 2.5269607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 1.0163932395417137, "kl": 0.02658260613679886, "learning_rate": 7.522266741182303e-08, "loss": 0.0202, "num_tokens": 65085830.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6160290241241455, "sampling/importance_sampling_ratio/mean": 0.9997236132621765, "sampling/importance_sampling_ratio/min": 0.6990983486175537, "sampling/sampling_logp_difference/max": 0.47997188568115234, "sampling/sampling_logp_difference/mean": 0.015597082674503326, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 211.28125, "completions/mean_terminated_length": 211.28125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4401535391807556, "epoch": 2.528186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.8937973539372797, "kl": 0.061278894543647766, "learning_rate": 7.484731818161049e-08, "loss": 0.0284, "num_tokens": 65113880.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.822419285774231, "sampling/importance_sampling_ratio/mean": 0.9997953772544861, "sampling/importance_sampling_ratio/min": 0.4811238646507263, "sampling/sampling_logp_difference/max": 0.7316305637359619, "sampling/sampling_logp_difference/mean": 0.015561332926154137, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 159.34375, "completions/mean_terminated_length": 159.34375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.4249591827392578, "epoch": 2.5294117647058822, "frac_reward_zero_std": 0.75, "grad_norm": 1.0349410570256068, "kl": 0.04541175812482834, "learning_rate": 7.447283198440763e-08, "loss": -0.0051, "num_tokens": 65138686.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3719853162765503, "sampling/importance_sampling_ratio/mean": 0.9995375871658325, "sampling/importance_sampling_ratio/min": 0.48112422227859497, "sampling/sampling_logp_difference/max": 0.7316298484802246, "sampling/sampling_logp_difference/mean": 0.016625817865133286, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3495437502861023, "epoch": 2.530637254901961, "frac_reward_zero_std": 0.75, "grad_norm": 1.077871423935281, "kl": 0.02568657323718071, "learning_rate": 7.409920958039794e-08, "loss": 0.012, "num_tokens": 65177006.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.609712839126587, "sampling/importance_sampling_ratio/mean": 0.9997337460517883, "sampling/importance_sampling_ratio/min": 0.6248358488082886, "sampling/sampling_logp_difference/max": 0.4760558605194092, "sampling/sampling_logp_difference/mean": 0.014553414657711983, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 185.546875, "completions/mean_terminated_length": 185.546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.36232730746269226, "epoch": 2.531862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.01971203032382966, "kl": 0.037130288779735565, "learning_rate": 7.372645172801112e-08, "loss": 0.0004, "num_tokens": 65205921.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4028819799423218, "sampling/importance_sampling_ratio/mean": 0.9995936155319214, "sampling/importance_sampling_ratio/min": 0.6272878050804138, "sampling/sampling_logp_difference/max": 0.46634984016418457, "sampling/sampling_logp_difference/mean": 0.014208411797881126, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 201.359375, "completions/mean_terminated_length": 201.359375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.43340033292770386, "epoch": 2.5330882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.8205182366007592, "kl": 0.03927885368466377, "learning_rate": 7.335455918392219e-08, "loss": 0.0102, "num_tokens": 65239512.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.379061222076416, "sampling/importance_sampling_ratio/mean": 1.0003657341003418, "sampling/importance_sampling_ratio/min": 0.6418119072914124, "sampling/sampling_logp_difference/max": 0.44345998764038086, "sampling/sampling_logp_difference/mean": 0.01631690375506878, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 206.796875, "completions/mean_terminated_length": 206.796875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3557499051094055, "epoch": 2.534313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01973796411473359, "kl": 0.025097548961639404, "learning_rate": 7.29835327030493e-08, "loss": 0.0002, "num_tokens": 65267323.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5297231674194336, "sampling/importance_sampling_ratio/mean": 1.0004937648773193, "sampling/importance_sampling_ratio/min": 0.6882672309875488, "sampling/sampling_logp_difference/max": 0.42508673667907715, "sampling/sampling_logp_difference/mean": 0.014410475268959999, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 210.4375, "completions/mean_terminated_length": 210.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.44263601303100586, "epoch": 2.5355392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8567327170284087, "kl": 0.033141568303108215, "learning_rate": 7.261337303855258e-08, "loss": 0.0189, "num_tokens": 65300071.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4919159412384033, "sampling/importance_sampling_ratio/mean": 1.0003559589385986, "sampling/importance_sampling_ratio/min": 0.7112941741943359, "sampling/sampling_logp_difference/max": 0.40006113052368164, "sampling/sampling_logp_difference/mean": 0.017117420211434364, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 201.015625, "completions/mean_terminated_length": 201.015625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4219868779182434, "epoch": 2.536764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 1.1735720986413527, "kl": 0.04875606298446655, "learning_rate": 7.224408094183299e-08, "loss": 0.0306, "num_tokens": 65327272.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4342132806777954, "sampling/importance_sampling_ratio/mean": 1.0001598596572876, "sampling/importance_sampling_ratio/min": 0.6172976493835449, "sampling/sampling_logp_difference/max": 0.4824039936065674, "sampling/sampling_logp_difference/mean": 0.016299307346343994, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 206.828125, "completions/mean_terminated_length": 206.828125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.41150209307670593, "epoch": 2.5379901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.22991337523865, "kl": 0.04031134024262428, "learning_rate": 7.187565716252991e-08, "loss": -0.0319, "num_tokens": 65355677.0, "reward": 0.3125, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.44028902053833, "sampling/importance_sampling_ratio/mean": 0.9997075796127319, "sampling/importance_sampling_ratio/min": 0.654695451259613, "sampling/sampling_logp_difference/max": 0.42358505725860596, "sampling/sampling_logp_difference/mean": 0.014406203292310238, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 197.109375, "completions/mean_terminated_length": 197.109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.37234973907470703, "epoch": 2.5392156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.01896846344675893, "kl": 0.02687685564160347, "learning_rate": 7.150810244852035e-08, "loss": 0.0002, "num_tokens": 65384580.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6081585884094238, "sampling/importance_sampling_ratio/mean": 1.000108242034912, "sampling/importance_sampling_ratio/min": 0.6136998534202576, "sampling/sampling_logp_difference/max": 0.4882493019104004, "sampling/sampling_logp_difference/mean": 0.015685098245739937, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 182.28125, "completions/mean_terminated_length": 182.28125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.40313535928726196, "epoch": 2.5404411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.016768388250676957, "kl": 0.02673422172665596, "learning_rate": 7.114141754591691e-08, "loss": 0.0003, "num_tokens": 65415206.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.551025390625, "sampling/importance_sampling_ratio/mean": 0.9990107417106628, "sampling/importance_sampling_ratio/min": 0.6119214296340942, "sampling/sampling_logp_difference/max": 0.4911513328552246, "sampling/sampling_logp_difference/mean": 0.015913192182779312, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 173.953125, "completions/mean_terminated_length": 173.953125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3046499490737915, "epoch": 2.5416666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.01361368443527799, "kl": 0.020227601751685143, "learning_rate": 7.077560319906694e-08, "loss": 0.0002, "num_tokens": 65443491.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.403162956237793, "sampling/importance_sampling_ratio/mean": 0.9997733235359192, "sampling/importance_sampling_ratio/min": 0.5613439679145813, "sampling/sampling_logp_difference/max": 0.5774214267730713, "sampling/sampling_logp_difference/mean": 0.012156651355326176, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.30056145787239075, "epoch": 2.542892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.013041721244658643, "kl": 0.02197718247771263, "learning_rate": 7.041066015055036e-08, "loss": 0.0002, "num_tokens": 65477979.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4096847772598267, "sampling/importance_sampling_ratio/mean": 1.000238299369812, "sampling/importance_sampling_ratio/min": 0.49773162603378296, "sampling/sampling_logp_difference/max": 0.6976943016052246, "sampling/sampling_logp_difference/mean": 0.012997021898627281, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3763963282108307, "epoch": 2.5441176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.9779476142011609, "kl": 0.03305581212043762, "learning_rate": 7.004658914117822e-08, "loss": 0.0217, "num_tokens": 65506347.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6296724081039429, "sampling/importance_sampling_ratio/mean": 1.0001276731491089, "sampling/importance_sampling_ratio/min": 0.6284497380256653, "sampling/sampling_logp_difference/max": 0.48837900161743164, "sampling/sampling_logp_difference/mean": 0.01456437073647976, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 168.609375, "completions/mean_terminated_length": 168.609375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3052729368209839, "epoch": 2.545343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.015143259848826397, "kl": 0.02452005073428154, "learning_rate": 6.968339090999186e-08, "loss": 0.0002, "num_tokens": 65536002.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5346802473068237, "sampling/importance_sampling_ratio/mean": 1.0001373291015625, "sampling/importance_sampling_ratio/min": 0.6947748064994812, "sampling/sampling_logp_difference/max": 0.4283220171928406, "sampling/sampling_logp_difference/mean": 0.01278787013143301, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4861215651035309, "epoch": 2.5465686274509802, "frac_reward_zero_std": 0.75, "grad_norm": 0.8034146107780947, "kl": 0.05419921875, "learning_rate": 6.932106619426064e-08, "loss": 0.0064, "num_tokens": 65569430.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.685737133026123, "sampling/importance_sampling_ratio/mean": 0.9997113943099976, "sampling/importance_sampling_ratio/min": 0.7005658745765686, "sampling/sampling_logp_difference/max": 0.5222029685974121, "sampling/sampling_logp_difference/mean": 0.01634575054049492, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.35584941506385803, "epoch": 2.547794117647059, "frac_reward_zero_std": 0.75, "grad_norm": 1.0645978537399938, "kl": 0.04145277291536331, "learning_rate": 6.895961572948067e-08, "loss": 0.0041, "num_tokens": 65597182.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5775846242904663, "sampling/importance_sampling_ratio/mean": 0.9998557567596436, "sampling/importance_sampling_ratio/min": 0.6304898858070374, "sampling/sampling_logp_difference/max": 0.4612581729888916, "sampling/sampling_logp_difference/mean": 0.015014015138149261, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 199.40625, "completions/mean_terminated_length": 199.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.39090844988822937, "epoch": 2.549019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.016649257406454135, "kl": 0.028168104588985443, "learning_rate": 6.859904024937347e-08, "loss": 0.0003, "num_tokens": 65627784.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4348572492599487, "sampling/importance_sampling_ratio/mean": 1.0003767013549805, "sampling/importance_sampling_ratio/min": 0.6622411012649536, "sampling/sampling_logp_difference/max": 0.4121255874633789, "sampling/sampling_logp_difference/mean": 0.014203079044818878, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 219.671875, "completions/mean_terminated_length": 219.671875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.47287118434906006, "epoch": 2.5502450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.8151743040471998, "kl": 0.06747891008853912, "learning_rate": 6.823934048588459e-08, "loss": 0.0163, "num_tokens": 65658275.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.600164771080017, "sampling/importance_sampling_ratio/mean": 1.0001966953277588, "sampling/importance_sampling_ratio/min": 0.7139780521392822, "sampling/sampling_logp_difference/max": 0.4701066017150879, "sampling/sampling_logp_difference/mean": 0.0164007730782032, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 253.5, "completions/mean_terminated_length": 253.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.5424678325653076, "epoch": 2.5514705882352944, "frac_reward_zero_std": 0.25, "grad_norm": 1.3165769753311134, "kl": 0.048580899834632874, "learning_rate": 6.78805171691817e-08, "loss": -0.0135, "num_tokens": 65695171.0, "reward": 0.53125, "reward_std": 0.6331988573074341, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5378328561782837, "sampling/importance_sampling_ratio/mean": 0.9999392628669739, "sampling/importance_sampling_ratio/min": 0.6459082365036011, "sampling/sampling_logp_difference/max": 0.4370979070663452, "sampling/sampling_logp_difference/mean": 0.01762511022388935, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 195.703125, "completions/mean_terminated_length": 195.703125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4645097851753235, "epoch": 2.5526960784313726, "frac_reward_zero_std": 0.25, "grad_norm": 1.5832438382196996, "kl": 0.06714235246181488, "learning_rate": 6.752257102765324e-08, "loss": 0.0134, "num_tokens": 65733312.0, "reward": 0.4375, "reward_std": 0.6311737298965454, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.7630969285964966, "sampling/importance_sampling_ratio/mean": 1.0013240575790405, "sampling/importance_sampling_ratio/min": 0.5906386971473694, "sampling/sampling_logp_difference/max": 0.5670719146728516, "sampling/sampling_logp_difference/mean": 0.016704872250556946, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 249.953125, "completions/mean_terminated_length": 249.953125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4421851634979248, "epoch": 2.553921568627451, "frac_reward_zero_std": 0.75, "grad_norm": 1.0813640317510225, "kl": 0.03265643119812012, "learning_rate": 6.716550278790739e-08, "loss": -0.0087, "num_tokens": 65770685.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5154715776443481, "sampling/importance_sampling_ratio/mean": 1.0001493692398071, "sampling/importance_sampling_ratio/min": 0.6494601964950562, "sampling/sampling_logp_difference/max": 0.4316136837005615, "sampling/sampling_logp_difference/mean": 0.015306571498513222, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 188.890625, "completions/mean_terminated_length": 188.890625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3632579743862152, "epoch": 2.5551470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 0.8485382711143361, "kl": 0.05607233941555023, "learning_rate": 6.680931317476996e-08, "loss": -0.01, "num_tokens": 65797350.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3961634635925293, "sampling/importance_sampling_ratio/mean": 1.0000603199005127, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.41213345527648926, "sampling/sampling_logp_difference/mean": 0.013949640095233917, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 190.140625, "completions/mean_terminated_length": 190.140625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.5942714214324951, "epoch": 2.556372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.907817282126184, "kl": 0.07968472689390182, "learning_rate": 6.645400291128356e-08, "loss": -0.0028, "num_tokens": 65834143.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4339807033538818, "sampling/importance_sampling_ratio/mean": 0.9993922710418701, "sampling/importance_sampling_ratio/min": 0.61481773853302, "sampling/sampling_logp_difference/max": 0.48642945289611816, "sampling/sampling_logp_difference/mean": 0.019588496536016464, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 238.734375, "completions/mean_terminated_length": 238.734375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.48568403720855713, "epoch": 2.5575980392156863, "frac_reward_zero_std": 0.5, "grad_norm": 0.9744634120275073, "kl": 0.0538434162735939, "learning_rate": 6.609957271870503e-08, "loss": 0.0087, "num_tokens": 65869518.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5924615859985352, "sampling/importance_sampling_ratio/mean": 1.000096082687378, "sampling/importance_sampling_ratio/min": 0.6778700351715088, "sampling/sampling_logp_difference/max": 0.46528100967407227, "sampling/sampling_logp_difference/mean": 0.016062160953879356, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 197.296875, "completions/mean_terminated_length": 197.296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3151473104953766, "epoch": 2.5588235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.018567241166384255, "kl": 0.023612767457962036, "learning_rate": 6.574602331650559e-08, "loss": 0.0002, "num_tokens": 65898369.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5695054531097412, "sampling/importance_sampling_ratio/mean": 0.9994983673095703, "sampling/importance_sampling_ratio/min": 0.7195582985877991, "sampling/sampling_logp_difference/max": 0.4507606029510498, "sampling/sampling_logp_difference/mean": 0.01402480062097311, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 197.34375, "completions/mean_terminated_length": 197.34375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.445318341255188, "epoch": 2.560049019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.2179666722930946, "kl": 0.03929883986711502, "learning_rate": 6.539335542236802e-08, "loss": 0.0111, "num_tokens": 65929911.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5594457387924194, "sampling/importance_sampling_ratio/mean": 1.000316858291626, "sampling/importance_sampling_ratio/min": 0.6546961069107056, "sampling/sampling_logp_difference/max": 0.44433045387268066, "sampling/sampling_logp_difference/mean": 0.01643485575914383, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 180.421875, "completions/mean_terminated_length": 180.421875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.31945115327835083, "epoch": 2.561274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.01948292048956947, "kl": 0.026923775672912598, "learning_rate": 6.504156975218567e-08, "loss": 0.0002, "num_tokens": 65955698.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4169365167617798, "sampling/importance_sampling_ratio/mean": 0.9995955228805542, "sampling/importance_sampling_ratio/min": 0.6281131505966187, "sampling/sampling_logp_difference/max": 0.46503496170043945, "sampling/sampling_logp_difference/mean": 0.014066488482058048, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 257.484375, "completions/mean_terminated_length": 257.484375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4186283051967621, "epoch": 2.5625, "frac_reward_zero_std": 0.75, "grad_norm": 0.9985697505810378, "kl": 0.025043699890375137, "learning_rate": 6.469066702006137e-08, "loss": 0.0695, "num_tokens": 65988593.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.5223242044448853, "sampling/importance_sampling_ratio/mean": 0.9997868537902832, "sampling/importance_sampling_ratio/min": 0.6325936317443848, "sampling/sampling_logp_difference/max": 0.4579271078109741, "sampling/sampling_logp_difference/mean": 0.016202857717871666, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 215.578125, "completions/mean_terminated_length": 215.578125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.40918493270874023, "epoch": 2.563725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.02424830290448722, "kl": 0.03615477308630943, "learning_rate": 6.43406479383053e-08, "loss": 0.0004, "num_tokens": 66017734.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4315046072006226, "sampling/importance_sampling_ratio/mean": 0.9997341632843018, "sampling/importance_sampling_ratio/min": 0.6621752977371216, "sampling/sampling_logp_difference/max": 0.41222500801086426, "sampling/sampling_logp_difference/mean": 0.014971710741519928, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 135.828125, "completions/mean_terminated_length": 135.828125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.2993594706058502, "epoch": 2.564950980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.027237160966415087, "kl": 0.0391191691160202, "learning_rate": 6.399151321743423e-08, "loss": 0.0004, "num_tokens": 66037691.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.480431079864502, "sampling/importance_sampling_ratio/mean": 1.000072956085205, "sampling/importance_sampling_ratio/min": 0.661615252494812, "sampling/sampling_logp_difference/max": 0.4130711555480957, "sampling/sampling_logp_difference/mean": 0.014084149152040482, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 221.265625, "completions/mean_terminated_length": 221.265625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3643258213996887, "epoch": 2.5661764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.011489225833486107, "kl": 0.021840862929821014, "learning_rate": 6.364326356616917e-08, "loss": 0.0002, "num_tokens": 66077052.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6226626634597778, "sampling/importance_sampling_ratio/mean": 0.9994646906852722, "sampling/importance_sampling_ratio/min": 0.6210864782333374, "sampling/sampling_logp_difference/max": 0.4840683937072754, "sampling/sampling_logp_difference/mean": 0.014700526371598244, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 188.046875, "completions/mean_terminated_length": 188.046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.33661404252052307, "epoch": 2.5674019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.014335949900874318, "kl": 0.022317927330732346, "learning_rate": 6.329589969143517e-08, "loss": 0.0002, "num_tokens": 66107103.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.354552984237671, "sampling/importance_sampling_ratio/mean": 0.9997498989105225, "sampling/importance_sampling_ratio/min": 0.6162945628166199, "sampling/sampling_logp_difference/max": 0.48403024673461914, "sampling/sampling_logp_difference/mean": 0.013629063963890076, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 211.28125, "completions/mean_terminated_length": 211.28125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.4470069706439972, "epoch": 2.568627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.75070318927903, "kl": 0.05030011385679245, "learning_rate": 6.29494222983587e-08, "loss": 0.0328, "num_tokens": 66145953.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4384031295776367, "sampling/importance_sampling_ratio/mean": 0.999783456325531, "sampling/importance_sampling_ratio/min": 0.6539222002029419, "sampling/sampling_logp_difference/max": 0.4247668981552124, "sampling/sampling_logp_difference/mean": 0.015024135820567608, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 206.34375, "completions/mean_terminated_length": 206.34375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4453170895576477, "epoch": 2.5698529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9820557513300701, "kl": 0.04464089497923851, "learning_rate": 6.260383209026704e-08, "loss": 0.0161, "num_tokens": 66179607.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5414364337921143, "sampling/importance_sampling_ratio/mean": 0.9994871616363525, "sampling/importance_sampling_ratio/min": 0.6254764199256897, "sampling/sampling_logp_difference/max": 0.4692416191101074, "sampling/sampling_logp_difference/mean": 0.016002651304006577, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4869428277015686, "epoch": 2.571078431372549, "frac_reward_zero_std": 0.25, "grad_norm": 1.2831465130654096, "kl": 0.08460487425327301, "learning_rate": 6.225912976868636e-08, "loss": 0.017, "num_tokens": 66214071.0, "reward": 0.21875, "reward_std": 0.5539814233779907, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5390594005584717, "sampling/importance_sampling_ratio/mean": 1.0002871751785278, "sampling/importance_sampling_ratio/min": 0.5630738735198975, "sampling/sampling_logp_difference/max": 0.5743444561958313, "sampling/sampling_logp_difference/mean": 0.017172526568174362, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 219.484375, "completions/mean_terminated_length": 219.484375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3674548864364624, "epoch": 2.5723039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.021997753395319782, "kl": 0.03306593745946884, "learning_rate": 6.191531603334044e-08, "loss": 0.0003, "num_tokens": 66243110.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5714753866195679, "sampling/importance_sampling_ratio/mean": 0.9999969005584717, "sampling/importance_sampling_ratio/min": 0.5362951159477234, "sampling/sampling_logp_difference/max": 0.6230707168579102, "sampling/sampling_logp_difference/mean": 0.014687325805425644, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 211.828125, "completions/mean_terminated_length": 211.828125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.30457037687301636, "epoch": 2.5735294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.012549578551228192, "kl": 0.01527449395507574, "learning_rate": 6.157239158214966e-08, "loss": 0.0002, "num_tokens": 66279771.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5515893697738647, "sampling/importance_sampling_ratio/mean": 0.9998092651367188, "sampling/importance_sampling_ratio/min": 0.6406391859054565, "sampling/sampling_logp_difference/max": 0.44528889656066895, "sampling/sampling_logp_difference/mean": 0.012442250736057758, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 172.140625, "completions/mean_terminated_length": 172.140625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3210659325122833, "epoch": 2.5747549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.023644061330135357, "kl": 0.02690596878528595, "learning_rate": 6.123035711122859e-08, "loss": 0.0003, "num_tokens": 66308676.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 0.9999759197235107, "sampling/importance_sampling_ratio/min": 0.6546952724456787, "sampling/sampling_logp_difference/max": 0.49199533462524414, "sampling/sampling_logp_difference/mean": 0.014081955887377262, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 200.515625, "completions/mean_terminated_length": 200.515625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.374914288520813, "epoch": 2.575980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.018392236450731337, "kl": 0.02459472417831421, "learning_rate": 6.088921331488566e-08, "loss": 0.0002, "num_tokens": 66338325.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.363040804862976, "sampling/importance_sampling_ratio/mean": 1.000260591506958, "sampling/importance_sampling_ratio/min": 0.5137227177619934, "sampling/sampling_logp_difference/max": 0.6660715937614441, "sampling/sampling_logp_difference/mean": 0.014474395662546158, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 159.71875, "completions/mean_terminated_length": 159.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3740498721599579, "epoch": 2.577205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.9346502239688184, "kl": 0.028557591140270233, "learning_rate": 6.05489608856214e-08, "loss": -0.0112, "num_tokens": 66365443.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3664571046829224, "sampling/importance_sampling_ratio/mean": 0.9995359182357788, "sampling/importance_sampling_ratio/min": 0.6773549318313599, "sampling/sampling_logp_difference/max": 0.38955986499786377, "sampling/sampling_logp_difference/mean": 0.014828909188508987, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 222.9375, "completions/mean_terminated_length": 222.9375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.40943849086761475, "epoch": 2.5784313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.01977636754486806, "kl": 0.02581152133643627, "learning_rate": 6.020960051412638e-08, "loss": 0.0003, "num_tokens": 66396703.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4477920532226562, "sampling/importance_sampling_ratio/mean": 1.000298261642456, "sampling/importance_sampling_ratio/min": 0.5010133385658264, "sampling/sampling_logp_difference/max": 0.6911225318908691, "sampling/sampling_logp_difference/mean": 0.01465803012251854, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 211.140625, "completions/mean_terminated_length": 211.140625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3674590289592743, "epoch": 2.579656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.01807957295302773, "kl": 0.019815169274806976, "learning_rate": 5.98711328892808e-08, "loss": 0.0002, "num_tokens": 66429192.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6007360219955444, "sampling/importance_sampling_ratio/mean": 1.0005848407745361, "sampling/importance_sampling_ratio/min": 0.6564944982528687, "sampling/sampling_logp_difference/max": 0.47046351432800293, "sampling/sampling_logp_difference/mean": 0.014578346163034439, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 193.484375, "completions/mean_terminated_length": 193.484375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.47036683559417725, "epoch": 2.5808823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.9480830279283758, "kl": 0.06246185302734375, "learning_rate": 5.9533558698152355e-08, "loss": -0.0002, "num_tokens": 66460087.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4385663270950317, "sampling/importance_sampling_ratio/mean": 1.0000579357147217, "sampling/importance_sampling_ratio/min": 0.6575733423233032, "sampling/sampling_logp_difference/max": 0.41919898986816406, "sampling/sampling_logp_difference/mean": 0.016523167490959167, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 236.078125, "completions/mean_terminated_length": 236.078125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.4175148010253906, "epoch": 2.582107843137255, "frac_reward_zero_std": 0.5, "grad_norm": 0.9270753704537925, "kl": 0.04759068042039871, "learning_rate": 5.919687862599548e-08, "loss": 0.0208, "num_tokens": 66494028.0, "reward": 0.28125, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4843947887420654, "sampling/importance_sampling_ratio/mean": 0.9996776580810547, "sampling/importance_sampling_ratio/min": 0.6776774525642395, "sampling/sampling_logp_difference/max": 0.3950071334838867, "sampling/sampling_logp_difference/mean": 0.014096850529313087, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 178.796875, "completions/mean_terminated_length": 178.796875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3843807578086853, "epoch": 2.5833333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.01602146693048434, "kl": 0.028620852157473564, "learning_rate": 5.886109335624928e-08, "loss": 0.0003, "num_tokens": 66525407.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.564172387123108, "sampling/importance_sampling_ratio/mean": 1.0002281665802002, "sampling/importance_sampling_ratio/min": 0.6718854904174805, "sampling/sampling_logp_difference/max": 0.4473569393157959, "sampling/sampling_logp_difference/mean": 0.016141705214977264, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 131.109375, "completions/mean_terminated_length": 131.109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.28437256813049316, "epoch": 2.5845588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.02630938554479464, "kl": 0.028298180550336838, "learning_rate": 5.8526203570536504e-08, "loss": 0.0003, "num_tokens": 66546230.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3972299098968506, "sampling/importance_sampling_ratio/mean": 0.9994284510612488, "sampling/importance_sampling_ratio/min": 0.613646924495697, "sampling/sampling_logp_difference/max": 0.48833560943603516, "sampling/sampling_logp_difference/mean": 0.014699292369186878, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 155.953125, "completions/mean_terminated_length": 155.953125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3374735713005066, "epoch": 2.5857843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.02280321129490057, "kl": 0.04040870815515518, "learning_rate": 5.819220994866236e-08, "loss": 0.0004, "num_tokens": 66571299.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4676449298858643, "sampling/importance_sampling_ratio/mean": 0.9996737837791443, "sampling/importance_sampling_ratio/min": 0.6067817211151123, "sampling/sampling_logp_difference/max": 0.4995861053466797, "sampling/sampling_logp_difference/mean": 0.013758618384599686, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 168.734375, "completions/mean_terminated_length": 168.734375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3715071678161621, "epoch": 2.5870098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.01729749422561657, "kl": 0.03268400579690933, "learning_rate": 5.7859113168612696e-08, "loss": 0.0003, "num_tokens": 66600466.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4979567527770996, "sampling/importance_sampling_ratio/mean": 1.0001771450042725, "sampling/importance_sampling_ratio/min": 0.6424275040626526, "sampling/sampling_logp_difference/max": 0.4425013065338135, "sampling/sampling_logp_difference/mean": 0.015010682865977287, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.41879019141197205, "epoch": 2.588235294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.794542944353632, "kl": 0.03571142628788948, "learning_rate": 5.7526913906552786e-08, "loss": 0.0208, "num_tokens": 66639106.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5448921918869019, "sampling/importance_sampling_ratio/mean": 1.0000910758972168, "sampling/importance_sampling_ratio/min": 0.6546945571899414, "sampling/sampling_logp_difference/max": 0.4349541664123535, "sampling/sampling_logp_difference/mean": 0.0153944818302989, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 237.6875, "completions/mean_terminated_length": 237.6875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.36095866560935974, "epoch": 2.5894607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.83986554242995, "kl": 0.03923949971795082, "learning_rate": 5.7195612836826055e-08, "loss": 0.0041, "num_tokens": 66671694.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3939363956451416, "sampling/importance_sampling_ratio/mean": 0.9999719858169556, "sampling/importance_sampling_ratio/min": 0.5363647937774658, "sampling/sampling_logp_difference/max": 0.6229407787322998, "sampling/sampling_logp_difference/mean": 0.013769697397947311, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 208.328125, "completions/mean_terminated_length": 208.328125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.41904306411743164, "epoch": 2.590686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.9101347229153239, "kl": 0.028980256989598274, "learning_rate": 5.686521063195287e-08, "loss": 0.0194, "num_tokens": 66703491.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4888678789138794, "sampling/importance_sampling_ratio/mean": 0.9995335936546326, "sampling/importance_sampling_ratio/min": 0.682695209980011, "sampling/sampling_logp_difference/max": 0.39801597595214844, "sampling/sampling_logp_difference/mean": 0.0153053505346179, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3330810070037842, "epoch": 2.5919117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.012560079172912738, "kl": 0.019664783030748367, "learning_rate": 5.6535707962628685e-08, "loss": 0.0002, "num_tokens": 66741099.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4760103225708008, "sampling/importance_sampling_ratio/mean": 1.000353455543518, "sampling/importance_sampling_ratio/min": 0.376099169254303, "sampling/sampling_logp_difference/max": 0.9779024124145508, "sampling/sampling_logp_difference/mean": 0.012436985038220882, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 182.953125, "completions/mean_terminated_length": 182.953125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.5205093026161194, "epoch": 2.593137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.9135108534463635, "kl": 0.04284369945526123, "learning_rate": 5.620710549772295e-08, "loss": -0.0014, "num_tokens": 66775832.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4819895029067993, "sampling/importance_sampling_ratio/mean": 1.0004993677139282, "sampling/importance_sampling_ratio/min": 0.720510721206665, "sampling/sampling_logp_difference/max": 0.3933854103088379, "sampling/sampling_logp_difference/mean": 0.017911650240421295, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 182.296875, "completions/mean_terminated_length": 182.296875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.362684965133667, "epoch": 2.594362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.019456104270723005, "kl": 0.028201991692185402, "learning_rate": 5.5879403904278034e-08, "loss": 0.0003, "num_tokens": 66802315.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4301707744598389, "sampling/importance_sampling_ratio/mean": 0.9996477365493774, "sampling/importance_sampling_ratio/min": 0.5825095772743225, "sampling/sampling_logp_difference/max": 0.5404096841812134, "sampling/sampling_logp_difference/mean": 0.015181276947259903, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 194.265625, "completions/mean_terminated_length": 194.265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.4143337309360504, "epoch": 2.5955882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.2345566335019353, "kl": 0.07903826236724854, "learning_rate": 5.555260384750721e-08, "loss": 0.0075, "num_tokens": 66830556.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4668275117874146, "sampling/importance_sampling_ratio/mean": 0.9987766742706299, "sampling/importance_sampling_ratio/min": 0.6479809880256653, "sampling/sampling_logp_difference/max": 0.43389391899108887, "sampling/sampling_logp_difference/mean": 0.01558046042919159, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 209.484375, "completions/mean_terminated_length": 209.484375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.36364811658859253, "epoch": 2.596813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.013713065630409327, "kl": 0.021273497492074966, "learning_rate": 5.5226705990794156e-08, "loss": 0.0002, "num_tokens": 66868315.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6138299703598022, "sampling/importance_sampling_ratio/mean": 0.9999983310699463, "sampling/importance_sampling_ratio/min": 0.6207950115203857, "sampling/sampling_logp_difference/max": 0.4786102771759033, "sampling/sampling_logp_difference/mean": 0.01500872615724802, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4230864346027374, "epoch": 2.5980392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8143844606419269, "kl": 0.01911357045173645, "learning_rate": 5.4901710995690576e-08, "loss": -0.0274, "num_tokens": 66903019.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.6599781513214111, "sampling/importance_sampling_ratio/mean": 1.0002446174621582, "sampling/importance_sampling_ratio/min": 0.500320315361023, "sampling/sampling_logp_difference/max": 0.6925067901611328, "sampling/sampling_logp_difference/mean": 0.014856458641588688, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 202.1875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.4207639992237091, "epoch": 2.599264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03326433934880384, "kl": 0.05689454451203346, "learning_rate": 5.4577619521915916e-08, "loss": 0.0006, "num_tokens": 66934471.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5942623615264893, "sampling/importance_sampling_ratio/mean": 1.0002871751785278, "sampling/importance_sampling_ratio/min": 0.6772719621658325, "sampling/sampling_logp_difference/max": 0.46641111373901367, "sampling/sampling_logp_difference/mean": 0.01560080423951149, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 222.953125, "completions/mean_terminated_length": 222.953125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.33361703157424927, "epoch": 2.6004901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.8840086549830525, "kl": 0.024904098361730576, "learning_rate": 5.425443222735526e-08, "loss": -0.0015, "num_tokens": 66964452.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4255083799362183, "sampling/importance_sampling_ratio/mean": 1.0000474452972412, "sampling/importance_sampling_ratio/min": 0.7039909362792969, "sampling/sampling_logp_difference/max": 0.35452842712402344, "sampling/sampling_logp_difference/mean": 0.012197159230709076, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 253.734375, "completions/mean_terminated_length": 253.734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.5072479844093323, "epoch": 2.6017156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.6787254225108151, "kl": 0.052853070199489594, "learning_rate": 5.393214976805832e-08, "loss": -0.0094, "num_tokens": 67003027.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5244715213775635, "sampling/importance_sampling_ratio/mean": 1.0005197525024414, "sampling/importance_sampling_ratio/min": 0.6650070548057556, "sampling/sampling_logp_difference/max": 0.4216477870941162, "sampling/sampling_logp_difference/mean": 0.016826054081320763, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 178.234375, "completions/mean_terminated_length": 178.234375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.36420899629592896, "epoch": 2.6029411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.03384692700289003, "kl": 0.032734621316194534, "learning_rate": 5.361077279823817e-08, "loss": 0.0003, "num_tokens": 67031010.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6157547235488892, "sampling/importance_sampling_ratio/mean": 0.9999369978904724, "sampling/importance_sampling_ratio/min": 0.6351216435432434, "sampling/sampling_logp_difference/max": 0.47980213165283203, "sampling/sampling_logp_difference/mean": 0.01496285479515791, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 237.203125, "completions/mean_terminated_length": 237.203125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.38095271587371826, "epoch": 2.6041666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6323050600572573, "kl": 0.03123561292886734, "learning_rate": 5.3290301970269514e-08, "loss": 0.0046, "num_tokens": 67063119.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7322168350219727, "sampling/importance_sampling_ratio/mean": 1.0004644393920898, "sampling/importance_sampling_ratio/min": 0.6108182072639465, "sampling/sampling_logp_difference/max": 0.5494019985198975, "sampling/sampling_logp_difference/mean": 0.01421417761594057, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 158.78125, "completions/mean_terminated_length": 158.78125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3642742335796356, "epoch": 2.605392156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.034789694831971765, "kl": 0.05975145846605301, "learning_rate": 5.29707379346882e-08, "loss": 0.0006, "num_tokens": 67088753.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6307361125946045, "sampling/importance_sampling_ratio/mean": 1.0000109672546387, "sampling/importance_sampling_ratio/min": 0.6289353966712952, "sampling/sampling_logp_difference/max": 0.4890315532684326, "sampling/sampling_logp_difference/mean": 0.015447361394762993, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 227.78125, "completions/mean_terminated_length": 227.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.36701464653015137, "epoch": 2.6066176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.7879647133880849, "kl": 0.022777989506721497, "learning_rate": 5.2652081340188506e-08, "loss": -0.0569, "num_tokens": 67122307.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997912049293518, "sampling/importance_sampling_ratio/min": 0.6298379898071289, "sampling/sampling_logp_difference/max": 0.8105654716491699, "sampling/sampling_logp_difference/mean": 0.014233799651265144, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3423210680484772, "epoch": 2.607843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0162284263371988, "kl": 0.024179883301258087, "learning_rate": 5.2334332833623487e-08, "loss": 0.0002, "num_tokens": 67155763.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5865659713745117, "sampling/importance_sampling_ratio/mean": 1.000349998474121, "sampling/importance_sampling_ratio/min": 0.6727445125579834, "sampling/sampling_logp_difference/max": 0.46157193183898926, "sampling/sampling_logp_difference/mean": 0.014300025999546051, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 230.375, "completions/mean_terminated_length": 230.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.4718915522098541, "epoch": 2.6090686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.028572234533195292, "kl": 0.03860338404774666, "learning_rate": 5.2017493060002196e-08, "loss": 0.0004, "num_tokens": 67188203.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3974932432174683, "sampling/importance_sampling_ratio/mean": 0.9992986917495728, "sampling/importance_sampling_ratio/min": 0.6844436526298523, "sampling/sampling_logp_difference/max": 0.3791489601135254, "sampling/sampling_logp_difference/mean": 0.01681319996714592, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 203.109375, "completions/mean_terminated_length": 203.109375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.376884400844574, "epoch": 2.610294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.0171315248231306, "kl": 0.022489607334136963, "learning_rate": 5.1701562662489596e-08, "loss": 0.0002, "num_tokens": 67222898.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4988843202590942, "sampling/importance_sampling_ratio/mean": 1.000124216079712, "sampling/importance_sampling_ratio/min": 0.6411409974098206, "sampling/sampling_logp_difference/max": 0.4445059299468994, "sampling/sampling_logp_difference/mean": 0.015070099383592606, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 226.421875, "completions/mean_terminated_length": 226.421875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.43235793709754944, "epoch": 2.611519607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6793980733597528, "kl": 0.060259003192186356, "learning_rate": 5.138654228240424e-08, "loss": 0.0097, "num_tokens": 67255853.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3672351837158203, "sampling/importance_sampling_ratio/mean": 0.9999115467071533, "sampling/importance_sampling_ratio/min": 0.5359736084938049, "sampling/sampling_logp_difference/max": 0.6236703395843506, "sampling/sampling_logp_difference/mean": 0.015460221096873283, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 155.328125, "completions/mean_terminated_length": 155.328125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.39062735438346863, "epoch": 2.6127450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9959490995274431, "kl": 0.02826238051056862, "learning_rate": 5.1072432559217446e-08, "loss": 0.0213, "num_tokens": 67284562.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2868387699127197, "sampling/importance_sampling_ratio/mean": 1.000374436378479, "sampling/importance_sampling_ratio/min": 0.5039337873458862, "sampling/sampling_logp_difference/max": 0.6853103637695312, "sampling/sampling_logp_difference/mean": 0.014396356418728828, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 203.28125, "completions/mean_terminated_length": 203.28125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4167785048484802, "epoch": 2.6139705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.015378856424382125, "kl": 0.022126592695713043, "learning_rate": 5.075923413055222e-08, "loss": 0.0002, "num_tokens": 67313892.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5830098390579224, "sampling/importance_sampling_ratio/mean": 0.9998418092727661, "sampling/importance_sampling_ratio/min": 0.6997085809707642, "sampling/sampling_logp_difference/max": 0.45932793617248535, "sampling/sampling_logp_difference/mean": 0.0156828835606575, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 217.296875, "completions/mean_terminated_length": 217.296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3475598096847534, "epoch": 2.6151960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.7135612162620825, "kl": 0.02928490936756134, "learning_rate": 5.044694763218149e-08, "loss": -0.0157, "num_tokens": 67343959.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4025582075119019, "sampling/importance_sampling_ratio/mean": 0.9999585151672363, "sampling/importance_sampling_ratio/min": 0.6419162750244141, "sampling/sampling_logp_difference/max": 0.4432973861694336, "sampling/sampling_logp_difference/mean": 0.01262687612324953, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3160248398780823, "epoch": 2.616421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.01667655163182362, "kl": 0.01977652497589588, "learning_rate": 5.013557369802701e-08, "loss": 0.0002, "num_tokens": 67374191.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5577393770217896, "sampling/importance_sampling_ratio/mean": 1.0001202821731567, "sampling/importance_sampling_ratio/min": 0.7577202320098877, "sampling/sampling_logp_difference/max": 0.4432356357574463, "sampling/sampling_logp_difference/mean": 0.012885408475995064, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 172.453125, "completions/mean_terminated_length": 172.453125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.39916563034057617, "epoch": 2.6176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.020251231669469082, "kl": 0.027808837592601776, "learning_rate": 4.982511296015807e-08, "loss": 0.0003, "num_tokens": 67400700.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.8188602924346924, "sampling/importance_sampling_ratio/mean": 1.000488519668579, "sampling/importance_sampling_ratio/min": 0.6121949553489685, "sampling/sampling_logp_difference/max": 0.5982100963592529, "sampling/sampling_logp_difference/mean": 0.016949813812971115, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 189.359375, "completions/mean_terminated_length": 189.359375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3966854512691498, "epoch": 2.618872549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.035290026610468254, "kl": 0.06541240215301514, "learning_rate": 4.951556604879048e-08, "loss": 0.0006, "num_tokens": 67429363.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6362992525100708, "sampling/importance_sampling_ratio/mean": 0.999947190284729, "sampling/importance_sampling_ratio/min": 0.7212660908699036, "sampling/sampling_logp_difference/max": 0.49243712425231934, "sampling/sampling_logp_difference/mean": 0.015420676209032536, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 211.71875, "completions/mean_terminated_length": 211.71875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4054481089115143, "epoch": 2.6200980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.7655339023697475, "kl": 0.05712512135505676, "learning_rate": 4.9206933592284725e-08, "loss": -0.0016, "num_tokens": 67467905.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.439152717590332, "sampling/importance_sampling_ratio/mean": 0.9993346333503723, "sampling/importance_sampling_ratio/min": 0.5603407025337219, "sampling/sampling_logp_difference/max": 0.5792102813720703, "sampling/sampling_logp_difference/mean": 0.015379207208752632, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 208.359375, "completions/mean_terminated_length": 208.359375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.38255298137664795, "epoch": 2.6213235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.013470305053340506, "kl": 0.023093286901712418, "learning_rate": 4.889921621714516e-08, "loss": 0.0002, "num_tokens": 67505992.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.54817533493042, "sampling/importance_sampling_ratio/mean": 0.9997929334640503, "sampling/importance_sampling_ratio/min": 0.6157702207565308, "sampling/sampling_logp_difference/max": 0.4848814010620117, "sampling/sampling_logp_difference/mean": 0.01380122546106577, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 227.03125, "completions/mean_terminated_length": 227.03125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.46854493021965027, "epoch": 2.622549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.2791723949659457, "kl": 0.03231387585401535, "learning_rate": 4.859241454801866e-08, "loss": 0.0058, "num_tokens": 67541402.0, "reward": 0.4375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.664849042892456, "sampling/importance_sampling_ratio/mean": 0.9997316598892212, "sampling/importance_sampling_ratio/min": 0.6409977674484253, "sampling/sampling_logp_difference/max": 0.5097345113754272, "sampling/sampling_logp_difference/mean": 0.017020462080836296, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 216.703125, "completions/mean_terminated_length": 216.703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.30431610345840454, "epoch": 2.623774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.016013747254574693, "kl": 0.023328766226768494, "learning_rate": 4.828652920769311e-08, "loss": 0.0002, "num_tokens": 67573559.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.449704647064209, "sampling/importance_sampling_ratio/mean": 1.0002050399780273, "sampling/importance_sampling_ratio/min": 0.7126727104187012, "sampling/sampling_logp_difference/max": 0.37135982513427734, "sampling/sampling_logp_difference/mean": 0.011685984209179878, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 167.40625, "completions/mean_terminated_length": 167.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.34901028871536255, "epoch": 2.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0236117528688996, "kl": 0.025571728125214577, "learning_rate": 4.7981560817096366e-08, "loss": 0.0003, "num_tokens": 67601185.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4754151105880737, "sampling/importance_sampling_ratio/mean": 1.0006507635116577, "sampling/importance_sampling_ratio/min": 0.663468599319458, "sampling/sampling_logp_difference/max": 0.41027379035949707, "sampling/sampling_logp_difference/mean": 0.01529423613101244, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 160.46875, "completions/mean_terminated_length": 160.46875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3441200852394104, "epoch": 2.626225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.8936793610125214, "kl": 0.031151149421930313, "learning_rate": 4.767750999529485e-08, "loss": 0.0081, "num_tokens": 67625711.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6312365531921387, "sampling/importance_sampling_ratio/mean": 1.0001682043075562, "sampling/importance_sampling_ratio/min": 0.6543079018592834, "sampling/sampling_logp_difference/max": 0.48933839797973633, "sampling/sampling_logp_difference/mean": 0.014745216816663742, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 184.265625, "completions/mean_terminated_length": 184.265625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.31365156173706055, "epoch": 2.627450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.0837940597102713, "kl": 0.019635751843452454, "learning_rate": 4.7374377359492624e-08, "loss": 0.0229, "num_tokens": 67655728.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.311972975730896, "sampling/importance_sampling_ratio/mean": 0.9995397925376892, "sampling/importance_sampling_ratio/min": 0.38968032598495483, "sampling/sampling_logp_difference/max": 0.9424285888671875, "sampling/sampling_logp_difference/mean": 0.01384140457957983, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 187.453125, "completions/mean_terminated_length": 187.453125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4393818974494934, "epoch": 2.6286764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.018799716519235687, "kl": 0.025906246155500412, "learning_rate": 4.707216352502974e-08, "loss": 0.0003, "num_tokens": 67684461.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.609341025352478, "sampling/importance_sampling_ratio/mean": 0.9995216131210327, "sampling/importance_sampling_ratio/min": 0.6207955479621887, "sampling/sampling_logp_difference/max": 0.47675347328186035, "sampling/sampling_logp_difference/mean": 0.01651901751756668, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.49239933490753174, "epoch": 2.6299019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8208077058143795, "kl": 0.03966863825917244, "learning_rate": 4.6770869105380914e-08, "loss": -0.0083, "num_tokens": 67722757.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4605991840362549, "sampling/importance_sampling_ratio/mean": 0.9996809363365173, "sampling/importance_sampling_ratio/min": 0.660297155380249, "sampling/sampling_logp_difference/max": 0.41506528854370117, "sampling/sampling_logp_difference/mean": 0.016705110669136047, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 179.546875, "completions/mean_terminated_length": 179.546875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3138548731803894, "epoch": 2.631127450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.018924036857109135, "kl": 0.030965154990553856, "learning_rate": 4.647049471215497e-08, "loss": 0.0003, "num_tokens": 67752376.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3848775625228882, "sampling/importance_sampling_ratio/mean": 0.9996682405471802, "sampling/importance_sampling_ratio/min": 0.6506453156471252, "sampling/sampling_logp_difference/max": 0.4297906160354614, "sampling/sampling_logp_difference/mean": 0.01328134536743164, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 177.0625, "completions/mean_terminated_length": 177.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.38742679357528687, "epoch": 2.6323529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.9040913443069275, "kl": 0.04941537231206894, "learning_rate": 4.6171040955092835e-08, "loss": -0.0027, "num_tokens": 67779788.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4697128534317017, "sampling/importance_sampling_ratio/mean": 0.9992687106132507, "sampling/importance_sampling_ratio/min": 0.5127838850021362, "sampling/sampling_logp_difference/max": 0.667900800704956, "sampling/sampling_logp_difference/mean": 0.01535370759665966, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 212.515625, "completions/mean_terminated_length": 212.515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3959088623523712, "epoch": 2.633578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.014901367515449515, "kl": 0.02232431247830391, "learning_rate": 4.587250844206664e-08, "loss": 0.0002, "num_tokens": 67810893.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5291082859039307, "sampling/importance_sampling_ratio/mean": 1.0004322528839111, "sampling/importance_sampling_ratio/min": 0.6461935639381409, "sampling/sampling_logp_difference/max": 0.43665623664855957, "sampling/sampling_logp_difference/mean": 0.01567588374018669, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 179.296875, "completions/mean_terminated_length": 179.296875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4003340005874634, "epoch": 2.6348039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0464033584410158, "kl": 0.05728421360254288, "learning_rate": 4.557489777907836e-08, "loss": 0.0006, "num_tokens": 67838304.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4168375730514526, "sampling/importance_sampling_ratio/mean": 0.9998183250427246, "sampling/importance_sampling_ratio/min": 0.6632786393165588, "sampling/sampling_logp_difference/max": 0.41056013107299805, "sampling/sampling_logp_difference/mean": 0.015619423240423203, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 225.5625, "completions/mean_terminated_length": 225.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3121446967124939, "epoch": 2.6360294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.013397615060628754, "kl": 0.028556915000081062, "learning_rate": 4.527820957025891e-08, "loss": 0.0003, "num_tokens": 67871972.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3516439199447632, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.6623451709747314, "sampling/sampling_logp_difference/max": 0.411968469619751, "sampling/sampling_logp_difference/mean": 0.011982333846390247, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 174.28125, "completions/mean_terminated_length": 174.28125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3555576503276825, "epoch": 2.6372549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.9091776511384866, "kl": 0.029665805399417877, "learning_rate": 4.498244441786675e-08, "loss": 0.0073, "num_tokens": 67899238.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4009780883789062, "sampling/importance_sampling_ratio/mean": 0.9996954202651978, "sampling/importance_sampling_ratio/min": 0.6616666316986084, "sampling/sampling_logp_difference/max": 0.4129934310913086, "sampling/sampling_logp_difference/mean": 0.015006804838776588, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 201.59375, "completions/mean_terminated_length": 201.59375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3351283669471741, "epoch": 2.638480392156863, "frac_reward_zero_std": 0.75, "grad_norm": 0.9198191991067566, "kl": 0.037099454551935196, "learning_rate": 4.4687602922286016e-08, "loss": 0.0048, "num_tokens": 67929516.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4150298833847046, "sampling/importance_sampling_ratio/mean": 0.999969482421875, "sampling/importance_sampling_ratio/min": 0.6585971713066101, "sampling/sampling_logp_difference/max": 0.41764330863952637, "sampling/sampling_logp_difference/mean": 0.01474051084369421, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 236.046875, "completions/mean_terminated_length": 236.046875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.39587363600730896, "epoch": 2.639705882352941, "frac_reward_zero_std": 0.25, "grad_norm": 1.5191392782392061, "kl": 0.03483106195926666, "learning_rate": 4.4393685682026505e-08, "loss": -0.0213, "num_tokens": 67967247.0, "reward": 0.03125, "reward_std": 0.46656501293182373, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5031934976577759, "sampling/importance_sampling_ratio/mean": 0.9996311664581299, "sampling/importance_sampling_ratio/min": 0.6101205945014954, "sampling/sampling_logp_difference/max": 0.4940987229347229, "sampling/sampling_logp_difference/mean": 0.014731589704751968, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 201.8125, "completions/mean_terminated_length": 201.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.45350176095962524, "epoch": 2.6409313725490198, "frac_reward_zero_std": 0.5, "grad_norm": 1.3363351985516962, "kl": 0.03747811168432236, "learning_rate": 4.4100693293721516e-08, "loss": 0.0536, "num_tokens": 67996051.0, "reward": 0.3125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5776199102401733, "sampling/importance_sampling_ratio/mean": 1.0001548528671265, "sampling/importance_sampling_ratio/min": 0.6348408460617065, "sampling/sampling_logp_difference/max": 0.4559173583984375, "sampling/sampling_logp_difference/mean": 0.017571210861206055, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 193.671875, "completions/mean_terminated_length": 193.671875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.38786581158638, "epoch": 2.642156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.8836828028498205, "kl": 0.06262829899787903, "learning_rate": 4.3808626352127066e-08, "loss": -0.0266, "num_tokens": 68027454.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6130213737487793, "sampling/importance_sampling_ratio/mean": 1.0003948211669922, "sampling/importance_sampling_ratio/min": 0.6953710913658142, "sampling/sampling_logp_difference/max": 0.47810912132263184, "sampling/sampling_logp_difference/mean": 0.015348730608820915, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 182.6875, "completions/mean_terminated_length": 182.6875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4030335545539856, "epoch": 2.6433823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.03937552064487977, "kl": 0.09337794780731201, "learning_rate": 4.351748545012057e-08, "loss": 0.0007, "num_tokens": 68056154.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3002655506134033, "sampling/importance_sampling_ratio/mean": 0.9997163414955139, "sampling/importance_sampling_ratio/min": 0.6874826550483704, "sampling/sampling_logp_difference/max": 0.37471866607666016, "sampling/sampling_logp_difference/mean": 0.014955861493945122, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.33162474632263184, "epoch": 2.644607843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.016589471458956482, "kl": 0.021302416920661926, "learning_rate": 4.322727117869951e-08, "loss": 0.0002, "num_tokens": 68086418.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5277650356292725, "sampling/importance_sampling_ratio/mean": 0.9993041157722473, "sampling/importance_sampling_ratio/min": 0.6925998330116272, "sampling/sampling_logp_difference/max": 0.42380595207214355, "sampling/sampling_logp_difference/mean": 0.014312541112303734, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 178.734375, "completions/mean_terminated_length": 178.734375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.345700740814209, "epoch": 2.6458333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.8896953174773692, "kl": 0.034189336001873016, "learning_rate": 4.2937984126980686e-08, "loss": 0.0192, "num_tokens": 68113025.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6207730770111084, "sampling/importance_sampling_ratio/mean": 1.0003600120544434, "sampling/importance_sampling_ratio/min": 0.6178777813911438, "sampling/sampling_logp_difference/max": 0.48290324211120605, "sampling/sampling_logp_difference/mean": 0.014010941609740257, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 239.34375, "completions/mean_terminated_length": 239.34375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.42113956809043884, "epoch": 2.6470588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.7860919483765861, "kl": 0.041868604719638824, "learning_rate": 4.2649624882198196e-08, "loss": 0.0206, "num_tokens": 68148631.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.6585595607757568, "sampling/importance_sampling_ratio/mean": 0.9998828172683716, "sampling/importance_sampling_ratio/min": 0.620280385017395, "sampling/sampling_logp_difference/max": 0.5059494972229004, "sampling/sampling_logp_difference/mean": 0.015877092257142067, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.37317323684692383, "epoch": 2.6482843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.02156110873502598, "kl": 0.032606083899736404, "learning_rate": 4.2362194029703256e-08, "loss": 0.0003, "num_tokens": 68174243.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7885191440582275, "sampling/importance_sampling_ratio/mean": 1.0001945495605469, "sampling/importance_sampling_ratio/min": 0.6998572945594788, "sampling/sampling_logp_difference/max": 0.581387996673584, "sampling/sampling_logp_difference/mean": 0.014830888248980045, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 241.71875, "completions/mean_terminated_length": 241.71875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4071003794670105, "epoch": 2.6495098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.7269248362049141, "kl": 0.03592121601104736, "learning_rate": 4.207569215296214e-08, "loss": 0.0128, "num_tokens": 68207489.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4574626684188843, "sampling/importance_sampling_ratio/mean": 0.9998969435691833, "sampling/importance_sampling_ratio/min": 0.6248587369918823, "sampling/sampling_logp_difference/max": 0.4702296257019043, "sampling/sampling_logp_difference/mean": 0.01433138269931078, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 211.46875, "completions/mean_terminated_length": 211.46875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3052547574043274, "epoch": 2.650735294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.012652424112284191, "kl": 0.01907511055469513, "learning_rate": 4.179011983355568e-08, "loss": 0.0002, "num_tokens": 68245839.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4420716762542725, "sampling/importance_sampling_ratio/mean": 1.0002076625823975, "sampling/importance_sampling_ratio/min": 0.7396199703216553, "sampling/sampling_logp_difference/max": 0.36608076095581055, "sampling/sampling_logp_difference/mean": 0.012432013638317585, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 188.765625, "completions/mean_terminated_length": 188.765625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3332515358924866, "epoch": 2.6519607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0158117777399342, "kl": 0.02347681298851967, "learning_rate": 4.150547765117746e-08, "loss": 0.0002, "num_tokens": 68273536.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4440466165542603, "sampling/importance_sampling_ratio/mean": 0.9997535943984985, "sampling/importance_sampling_ratio/min": 0.44411587715148926, "sampling/sampling_logp_difference/max": 0.8116698265075684, "sampling/sampling_logp_difference/mean": 0.013610436581075191, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 202.796875, "completions/mean_terminated_length": 202.796875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3937033414840698, "epoch": 2.653186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 1.0443737361684446, "kl": 0.04720392823219299, "learning_rate": 4.1221766183633045e-08, "loss": -0.024, "num_tokens": 68310051.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.6483078002929688, "sampling/importance_sampling_ratio/mean": 0.9999201893806458, "sampling/importance_sampling_ratio/min": 0.614530086517334, "sampling/sampling_logp_difference/max": 0.49974918365478516, "sampling/sampling_logp_difference/mean": 0.015470354817807674, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 135.96875, "completions/mean_terminated_length": 135.96875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.23793260753154755, "epoch": 2.6544117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.014384866276741116, "kl": 0.017748642712831497, "learning_rate": 4.0938986006838926e-08, "loss": 0.0002, "num_tokens": 68334193.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007353067398071, "sampling/importance_sampling_ratio/mean": 0.9994776844978333, "sampling/importance_sampling_ratio/min": 0.6216983199119568, "sampling/sampling_logp_difference/max": 0.47530031204223633, "sampling/sampling_logp_difference/mean": 0.011151906102895737, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 206.046875, "completions/mean_terminated_length": 206.046875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.456847608089447, "epoch": 2.655637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.03623937633238227, "kl": 0.05674976482987404, "learning_rate": 4.065713769482082e-08, "loss": 0.0006, "num_tokens": 68366068.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5402013063430786, "sampling/importance_sampling_ratio/mean": 1.0000441074371338, "sampling/importance_sampling_ratio/min": 0.4165976345539093, "sampling/sampling_logp_difference/max": 0.8756344318389893, "sampling/sampling_logp_difference/mean": 0.017204947769641876, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 164.03125, "completions/mean_terminated_length": 164.03125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3916319012641907, "epoch": 2.656862745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.9749861504971195, "kl": 0.03285214304924011, "learning_rate": 4.037622181971295e-08, "loss": 0.0002, "num_tokens": 68393654.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6085870265960693, "sampling/importance_sampling_ratio/mean": 0.9998897314071655, "sampling/importance_sampling_ratio/min": 0.7179526686668396, "sampling/sampling_logp_difference/max": 0.4753561019897461, "sampling/sampling_logp_difference/mean": 0.01594218611717224, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 187.296875, "completions/mean_terminated_length": 187.296875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4151933789253235, "epoch": 2.6580882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 1.0280970633762743, "kl": 0.03589069843292236, "learning_rate": 4.009623895175662e-08, "loss": 0.0279, "num_tokens": 68422777.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6305079460144043, "sampling/importance_sampling_ratio/mean": 0.9992147088050842, "sampling/importance_sampling_ratio/min": 0.4902857542037964, "sampling/sampling_logp_difference/max": 0.7127668857574463, "sampling/sampling_logp_difference/mean": 0.016101282089948654, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 196.53125, "completions/mean_terminated_length": 196.53125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3541117310523987, "epoch": 2.659313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01327160963277398, "kl": 0.02483142912387848, "learning_rate": 3.981718965929959e-08, "loss": 0.0002, "num_tokens": 68457851.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.469534158706665, "sampling/importance_sampling_ratio/mean": 1.0002583265304565, "sampling/importance_sampling_ratio/min": 0.6208575963973999, "sampling/sampling_logp_difference/max": 0.4766535758972168, "sampling/sampling_logp_difference/mean": 0.013692292384803295, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 178.90625, "completions/mean_terminated_length": 178.90625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.33619722723960876, "epoch": 2.6605392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.019335190864553025, "kl": 0.026515118777751923, "learning_rate": 3.953907450879407e-08, "loss": 0.0003, "num_tokens": 68484277.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3703358173370361, "sampling/importance_sampling_ratio/mean": 1.000185489654541, "sampling/importance_sampling_ratio/min": 0.6254917979240417, "sampling/sampling_logp_difference/max": 0.46921706199645996, "sampling/sampling_logp_difference/mean": 0.014374321326613426, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 217.03125, "completions/mean_terminated_length": 217.03125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4130726456642151, "epoch": 2.661764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.015376443809146078, "kl": 0.031047407537698746, "learning_rate": 3.926189406479613e-08, "loss": 0.0003, "num_tokens": 68521319.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5721830129623413, "sampling/importance_sampling_ratio/mean": 1.000622272491455, "sampling/importance_sampling_ratio/min": 0.6926271915435791, "sampling/sampling_logp_difference/max": 0.4524650573730469, "sampling/sampling_logp_difference/mean": 0.014572503045201302, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 186.9375, "completions/mean_terminated_length": 186.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.43158823251724243, "epoch": 2.6629901960784315, "frac_reward_zero_std": 0.5, "grad_norm": 1.2637325352312798, "kl": 0.06611524522304535, "learning_rate": 3.898564888996475e-08, "loss": 0.0187, "num_tokens": 68550451.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.8852076530456543, "sampling/importance_sampling_ratio/mean": 1.0006320476531982, "sampling/importance_sampling_ratio/min": 0.6424117088317871, "sampling/sampling_logp_difference/max": 0.634037971496582, "sampling/sampling_logp_difference/mean": 0.01655680499970913, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 248.96875, "completions/mean_terminated_length": 248.96875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.34359270334243774, "epoch": 2.6642156862745097, "frac_reward_zero_std": 0.5, "grad_norm": 0.8861158320358442, "kl": 0.06602966040372849, "learning_rate": 3.871033954505998e-08, "loss": 0.0358, "num_tokens": 68580289.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.354966402053833, "sampling/importance_sampling_ratio/mean": 0.9997091889381409, "sampling/importance_sampling_ratio/min": 0.2973400950431824, "sampling/sampling_logp_difference/max": 1.212878704071045, "sampling/sampling_logp_difference/mean": 0.013687359169125557, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 174.78125, "completions/mean_terminated_length": 174.78125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3491934835910797, "epoch": 2.6654411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.01935570487676279, "kl": 0.02674412727355957, "learning_rate": 3.843596658894232e-08, "loss": 0.0003, "num_tokens": 68609683.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4011064767837524, "sampling/importance_sampling_ratio/mean": 0.9998527765274048, "sampling/importance_sampling_ratio/min": 0.7132253646850586, "sampling/sampling_logp_difference/max": 0.33795785903930664, "sampling/sampling_logp_difference/mean": 0.013049756176769733, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3384281396865845, "epoch": 2.6666666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.8721263736102155, "kl": 0.058564648032188416, "learning_rate": 3.816253057857144e-08, "loss": 0.0009, "num_tokens": 68636439.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4753774404525757, "sampling/importance_sampling_ratio/mean": 0.9996055364608765, "sampling/importance_sampling_ratio/min": 0.6482266783714294, "sampling/sampling_logp_difference/max": 0.4335148334503174, "sampling/sampling_logp_difference/mean": 0.014768009074032307, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 174.359375, "completions/mean_terminated_length": 174.359375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.38247376680374146, "epoch": 2.667892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.016492188096115544, "kl": 0.023220643401145935, "learning_rate": 3.789003206900537e-08, "loss": 0.0002, "num_tokens": 68669246.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.541258454322815, "sampling/importance_sampling_ratio/mean": 0.9998940229415894, "sampling/importance_sampling_ratio/min": 0.7146031856536865, "sampling/sampling_logp_difference/max": 0.4325993061065674, "sampling/sampling_logp_difference/mean": 0.015042722225189209, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 226.609375, "completions/mean_terminated_length": 226.609375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.406279981136322, "epoch": 2.6691176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.8099497129356059, "kl": 0.02597496472299099, "learning_rate": 3.7618471613398597e-08, "loss": 0.0241, "num_tokens": 68708469.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5932286977767944, "sampling/importance_sampling_ratio/mean": 0.9998207092285156, "sampling/importance_sampling_ratio/min": 0.3276415169239044, "sampling/sampling_logp_difference/max": 1.115835189819336, "sampling/sampling_logp_difference/mean": 0.015545186586678028, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 217.453125, "completions/mean_terminated_length": 217.453125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4454168379306793, "epoch": 2.670343137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.7105919119139977, "kl": 0.028678320348262787, "learning_rate": 3.734784976300165e-08, "loss": -0.0295, "num_tokens": 68744546.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4453904628753662, "sampling/importance_sampling_ratio/mean": 0.9995754361152649, "sampling/importance_sampling_ratio/min": 0.6882218718528748, "sampling/sampling_logp_difference/max": 0.37364399433135986, "sampling/sampling_logp_difference/mean": 0.016125116497278214, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 198.125, "completions/mean_terminated_length": 198.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4045882225036621, "epoch": 2.6715686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.01982988541081753, "kl": 0.02784121036529541, "learning_rate": 3.7078167067159826e-08, "loss": 0.0003, "num_tokens": 68772218.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4922415018081665, "sampling/importance_sampling_ratio/mean": 1.000109314918518, "sampling/importance_sampling_ratio/min": 0.6838484406471252, "sampling/sampling_logp_difference/max": 0.4002794027328491, "sampling/sampling_logp_difference/mean": 0.016974179074168205, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 198.46875, "completions/mean_terminated_length": 198.46875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4650859236717224, "epoch": 2.672794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.01901885931020824, "kl": 0.028293920680880547, "learning_rate": 3.6809424073311944e-08, "loss": 0.0003, "num_tokens": 68804360.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999034404754639, "sampling/importance_sampling_ratio/min": 0.6299536228179932, "sampling/sampling_logp_difference/max": 0.7612266540527344, "sampling/sampling_logp_difference/mean": 0.017532266676425934, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 195.3125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.32273679971694946, "epoch": 2.674019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.7491038958775039, "kl": 0.03821868821978569, "learning_rate": 3.654162132698918e-08, "loss": 0.0003, "num_tokens": 68832028.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3335254192352295, "sampling/importance_sampling_ratio/mean": 1.0005215406417847, "sampling/importance_sampling_ratio/min": 0.6803061366081238, "sampling/sampling_logp_difference/max": 0.38521242141723633, "sampling/sampling_logp_difference/mean": 0.013480523601174355, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 198.4375, "completions/mean_terminated_length": 198.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3952333331108093, "epoch": 2.6752450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.838085929295133, "kl": 0.05547407269477844, "learning_rate": 3.627475937181407e-08, "loss": 0.0126, "num_tokens": 68866056.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3003922700881958, "sampling/importance_sampling_ratio/mean": 0.9999629855155945, "sampling/importance_sampling_ratio/min": 0.47558560967445374, "sampling/sampling_logp_difference/max": 0.7432084083557129, "sampling/sampling_logp_difference/mean": 0.013009263202548027, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 186.984375, "completions/mean_terminated_length": 186.984375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.39547306299209595, "epoch": 2.6764705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.01706766127270785, "kl": 0.024384262040257454, "learning_rate": 3.600883874949967e-08, "loss": 0.0002, "num_tokens": 68893815.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4879440069198608, "sampling/importance_sampling_ratio/mean": 1.0000054836273193, "sampling/importance_sampling_ratio/min": 0.6621916890144348, "sampling/sampling_logp_difference/max": 0.4122002124786377, "sampling/sampling_logp_difference/mean": 0.015582316555082798, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 222.609375, "completions/mean_terminated_length": 222.609375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3833645284175873, "epoch": 2.6776960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.8444642619222315, "kl": 0.023905735462903976, "learning_rate": 3.574385999984786e-08, "loss": -0.0109, "num_tokens": 68926190.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5958937406539917, "sampling/importance_sampling_ratio/mean": 0.9997842311859131, "sampling/importance_sampling_ratio/min": 0.6492242813110352, "sampling/sampling_logp_difference/max": 0.4674339294433594, "sampling/sampling_logp_difference/mean": 0.015461385250091553, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3409844636917114, "epoch": 2.678921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.019021331515509558, "kl": 0.027882926166057587, "learning_rate": 3.54798236607487e-08, "loss": 0.0003, "num_tokens": 68950954.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3148980140686035, "sampling/importance_sampling_ratio/mean": 0.999673068523407, "sampling/importance_sampling_ratio/min": 0.6211196184158325, "sampling/sampling_logp_difference/max": 0.47623157501220703, "sampling/sampling_logp_difference/mean": 0.014010068029165268, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 158.390625, "completions/mean_terminated_length": 158.390625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.4011605978012085, "epoch": 2.6801470588235294, "frac_reward_zero_std": 0.75, "grad_norm": 1.0054874252543198, "kl": 0.0786287933588028, "learning_rate": 3.5216730268179337e-08, "loss": 0.0091, "num_tokens": 68983491.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5853873491287231, "sampling/importance_sampling_ratio/mean": 1.0001468658447266, "sampling/importance_sampling_ratio/min": 0.6642369627952576, "sampling/sampling_logp_difference/max": 0.4608287811279297, "sampling/sampling_logp_difference/mean": 0.01586076244711876, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 184.90625, "completions/mean_terminated_length": 184.90625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3708415627479553, "epoch": 2.681372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.020542560990797393, "kl": 0.030698316171765327, "learning_rate": 3.495458035620252e-08, "loss": 0.0003, "num_tokens": 69012285.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4295809268951416, "sampling/importance_sampling_ratio/mean": 0.9994001388549805, "sampling/importance_sampling_ratio/min": 0.6445699334144592, "sampling/sampling_logp_difference/max": 0.43917202949523926, "sampling/sampling_logp_difference/mean": 0.014541094191372395, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 169.0625, "completions/mean_terminated_length": 169.0625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.41730359196662903, "epoch": 2.6825980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.020414554045821067, "kl": 0.027429096400737762, "learning_rate": 3.469337445696629e-08, "loss": 0.0003, "num_tokens": 69040561.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4753774404525757, "sampling/importance_sampling_ratio/mean": 1.0000537633895874, "sampling/importance_sampling_ratio/min": 0.6311920881271362, "sampling/sampling_logp_difference/max": 0.4601449966430664, "sampling/sampling_logp_difference/mean": 0.016278889030218124, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 189.015625, "completions/mean_terminated_length": 189.015625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4509800970554352, "epoch": 2.6838235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.01918904970116333, "kl": 0.03321604058146477, "learning_rate": 3.4433113100701683e-08, "loss": 0.0003, "num_tokens": 69069698.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4652029275894165, "sampling/importance_sampling_ratio/mean": 0.9996045827865601, "sampling/importance_sampling_ratio/min": 0.679466187953949, "sampling/sampling_logp_difference/max": 0.3864477872848511, "sampling/sampling_logp_difference/mean": 0.017754685133695602, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 215.609375, "completions/mean_terminated_length": 215.609375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3309880197048187, "epoch": 2.685049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.01778985067857809, "kl": 0.026937957853078842, "learning_rate": 3.417379681572296e-08, "loss": 0.0003, "num_tokens": 69101145.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5628186464309692, "sampling/importance_sampling_ratio/mean": 1.000060796737671, "sampling/importance_sampling_ratio/min": 0.5163277387619019, "sampling/sampling_logp_difference/max": 0.6610136032104492, "sampling/sampling_logp_difference/mean": 0.013586295768618584, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.429769366979599, "epoch": 2.686274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.015703317269527433, "kl": 0.026214124634861946, "learning_rate": 3.391542612842574e-08, "loss": 0.0003, "num_tokens": 69133029.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4691550731658936, "sampling/importance_sampling_ratio/mean": 0.9995607137680054, "sampling/importance_sampling_ratio/min": 0.6254510879516602, "sampling/sampling_logp_difference/max": 0.4692821502685547, "sampling/sampling_logp_difference/mean": 0.016176287084817886, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 187.671875, "completions/mean_terminated_length": 187.671875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.32622671127319336, "epoch": 2.6875, "frac_reward_zero_std": 0.75, "grad_norm": 0.9161237725815474, "kl": 0.033167362213134766, "learning_rate": 3.365800156328619e-08, "loss": 0.0177, "num_tokens": 69164480.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5998271703720093, "sampling/importance_sampling_ratio/mean": 0.9996638298034668, "sampling/importance_sampling_ratio/min": 0.5369217395782471, "sampling/sampling_logp_difference/max": 0.6219029426574707, "sampling/sampling_logp_difference/mean": 0.013408014550805092, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 200.609375, "completions/mean_terminated_length": 200.609375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4098494052886963, "epoch": 2.688725490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.03817098702871339, "kl": 0.025800131261348724, "learning_rate": 3.3401523642859805e-08, "loss": 0.0002, "num_tokens": 69198823.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3600707054138184, "sampling/importance_sampling_ratio/mean": 0.9994595050811768, "sampling/importance_sampling_ratio/min": 0.6952628493309021, "sampling/sampling_logp_difference/max": 0.3634653091430664, "sampling/sampling_logp_difference/mean": 0.01569507271051407, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 183.734375, "completions/mean_terminated_length": 183.734375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3755292296409607, "epoch": 2.689950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9980885619171185, "kl": 0.026967518031597137, "learning_rate": 3.3145992887780475e-08, "loss": -0.0563, "num_tokens": 69227302.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4975247383117676, "sampling/importance_sampling_ratio/mean": 0.998956024646759, "sampling/importance_sampling_ratio/min": 0.5228726863861084, "sampling/sampling_logp_difference/max": 0.6484172344207764, "sampling/sampling_logp_difference/mean": 0.0146570960059762, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.35742872953414917, "epoch": 2.6911764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.010959171189284437, "kl": 0.01894654706120491, "learning_rate": 3.289140981675964e-08, "loss": 0.0002, "num_tokens": 69260462.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6141589879989624, "sampling/importance_sampling_ratio/mean": 1.0003001689910889, "sampling/importance_sampling_ratio/min": 0.6132730841636658, "sampling/sampling_logp_difference/max": 0.48894500732421875, "sampling/sampling_logp_difference/mean": 0.01310389768332243, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 245.640625, "completions/mean_terminated_length": 245.640625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4439132809638977, "epoch": 2.6924019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 1.0722411814635506, "kl": 0.0608699694275856, "learning_rate": 3.263777494658448e-08, "loss": 0.0081, "num_tokens": 69297735.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5792343616485596, "sampling/importance_sampling_ratio/mean": 1.0005708932876587, "sampling/importance_sampling_ratio/min": 0.6708058714866638, "sampling/sampling_logp_difference/max": 0.4569401741027832, "sampling/sampling_logp_difference/mean": 0.014909801073372364, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.33329761028289795, "epoch": 2.693627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.014752819647576854, "kl": 0.02324068918824196, "learning_rate": 3.2385088792118044e-08, "loss": 0.0002, "num_tokens": 69327407.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6179770231246948, "sampling/importance_sampling_ratio/mean": 1.0001076459884644, "sampling/importance_sampling_ratio/min": 0.6605718731880188, "sampling/sampling_logp_difference/max": 0.48117661476135254, "sampling/sampling_logp_difference/mean": 0.014407221227884293, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 235.28125, "completions/mean_terminated_length": 235.28125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.41965699195861816, "epoch": 2.6948529411764706, "frac_reward_zero_std": 0.5, "grad_norm": 1.0650251009076923, "kl": 0.04170294106006622, "learning_rate": 3.2133351866296955e-08, "loss": -0.0279, "num_tokens": 69362321.0, "reward": 0.25, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5023787021636963, "sampling/importance_sampling_ratio/mean": 1.0000550746917725, "sampling/importance_sampling_ratio/min": 0.6670454740524292, "sampling/sampling_logp_difference/max": 0.40704965591430664, "sampling/sampling_logp_difference/mean": 0.014436185359954834, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 221.9375, "completions/mean_terminated_length": 221.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.37368834018707275, "epoch": 2.696078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.012657824622548918, "kl": 0.018766485154628754, "learning_rate": 3.188256468013139e-08, "loss": 0.0002, "num_tokens": 69394909.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4549809694290161, "sampling/importance_sampling_ratio/mean": 1.0002429485321045, "sampling/importance_sampling_ratio/min": 0.6784660816192627, "sampling/sampling_logp_difference/max": 0.3879207372665405, "sampling/sampling_logp_difference/mean": 0.013512177392840385, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 181.984375, "completions/mean_terminated_length": 181.984375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.44905218482017517, "epoch": 2.6973039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.373832166490122, "kl": 0.04382198676466942, "learning_rate": 3.163272774270348e-08, "loss": -0.0308, "num_tokens": 69420956.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5140695571899414, "sampling/importance_sampling_ratio/mean": 1.000220775604248, "sampling/importance_sampling_ratio/min": 0.6410974264144897, "sampling/sampling_logp_difference/max": 0.44457387924194336, "sampling/sampling_logp_difference/mean": 0.01640354096889496, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 145.078125, "completions/mean_terminated_length": 145.078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.35924413800239563, "epoch": 2.6985294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 1.2076341985162982, "kl": 0.07685059309005737, "learning_rate": 3.1383841561166134e-08, "loss": -0.0304, "num_tokens": 69442785.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6274707317352295, "sampling/importance_sampling_ratio/mean": 1.0003020763397217, "sampling/importance_sampling_ratio/min": 0.6347670555114746, "sampling/sampling_logp_difference/max": 0.4870271682739258, "sampling/sampling_logp_difference/mean": 0.015782007947564125, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 197.46875, "completions/mean_terminated_length": 197.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4621891975402832, "epoch": 2.6997549019607843, "frac_reward_zero_std": 0.75, "grad_norm": 0.8831331827097225, "kl": 0.053350359201431274, "learning_rate": 3.1135906640742836e-08, "loss": 0.0105, "num_tokens": 69473791.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5071722269058228, "sampling/importance_sampling_ratio/mean": 0.9998569488525391, "sampling/importance_sampling_ratio/min": 0.6598401665687561, "sampling/sampling_logp_difference/max": 0.4157576560974121, "sampling/sampling_logp_difference/mean": 0.016618210822343826, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 205.828125, "completions/mean_terminated_length": 205.828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3371545076370239, "epoch": 2.700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.013762829006671326, "kl": 0.020944753661751747, "learning_rate": 3.088892348472561e-08, "loss": 0.0002, "num_tokens": 69505444.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6021487712860107, "sampling/importance_sampling_ratio/mean": 0.9997724294662476, "sampling/importance_sampling_ratio/min": 0.6512129306793213, "sampling/sampling_logp_difference/max": 0.4713456630706787, "sampling/sampling_logp_difference/mean": 0.013815833255648613, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 205.234375, "completions/mean_terminated_length": 205.234375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4217361509799957, "epoch": 2.702205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.8403160454799576, "kl": 0.03663209453225136, "learning_rate": 3.064289259447455e-08, "loss": -0.0207, "num_tokens": 69532451.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4563947916030884, "sampling/importance_sampling_ratio/mean": 1.000288486480713, "sampling/importance_sampling_ratio/min": 0.6954967975616455, "sampling/sampling_logp_difference/max": 0.37596404552459717, "sampling/sampling_logp_difference/mean": 0.0165574848651886, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 227.109375, "completions/mean_terminated_length": 227.109375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.5034979581832886, "epoch": 2.7034313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.8202365148511331, "kl": 0.04638688638806343, "learning_rate": 3.039781446941697e-08, "loss": 0.0061, "num_tokens": 69566266.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4162613153457642, "sampling/importance_sampling_ratio/mean": 1.0002670288085938, "sampling/importance_sampling_ratio/min": 0.7326375842094421, "sampling/sampling_logp_difference/max": 0.3480205535888672, "sampling/sampling_logp_difference/mean": 0.016674406826496124, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 178.96875, "completions/mean_terminated_length": 178.96875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4002281427383423, "epoch": 2.704656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.021476810741575578, "kl": 0.02550198882818222, "learning_rate": 3.015368960704584e-08, "loss": 0.0003, "num_tokens": 69595512.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4954711198806763, "sampling/importance_sampling_ratio/mean": 0.9999463558197021, "sampling/importance_sampling_ratio/min": 0.602114737033844, "sampling/sampling_logp_difference/max": 0.5073072910308838, "sampling/sampling_logp_difference/mean": 0.0180397629737854, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 227.09375, "completions/mean_terminated_length": 227.09375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.36214518547058105, "epoch": 2.7058823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.5992061058659258, "kl": 0.03874623402953148, "learning_rate": 2.991051850291915e-08, "loss": 0.0026, "num_tokens": 69624574.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5040321350097656, "sampling/importance_sampling_ratio/mean": 0.9997340440750122, "sampling/importance_sampling_ratio/min": 0.6141670942306519, "sampling/sampling_logp_difference/max": 0.4874882698059082, "sampling/sampling_logp_difference/mean": 0.014499923214316368, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3309875726699829, "epoch": 2.707107843137255, "frac_reward_zero_std": 0.75, "grad_norm": 1.0591527126619875, "kl": 0.027056990191340446, "learning_rate": 2.9668301650658756e-08, "loss": -0.019, "num_tokens": 69659990.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3536651134490967, "sampling/importance_sampling_ratio/mean": 1.0002684593200684, "sampling/importance_sampling_ratio/min": 0.5260624885559082, "sampling/sampling_logp_difference/max": 0.6423352956771851, "sampling/sampling_logp_difference/mean": 0.01373043842613697, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 188.71875, "completions/mean_terminated_length": 188.71875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3033469319343567, "epoch": 2.7083333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.011828145824959092, "kl": 0.019223835319280624, "learning_rate": 2.9427039541949638e-08, "loss": 0.0002, "num_tokens": 69688420.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4354578256607056, "sampling/importance_sampling_ratio/mean": 0.9997765421867371, "sampling/importance_sampling_ratio/min": 0.6278591156005859, "sampling/sampling_logp_difference/max": 0.4654395580291748, "sampling/sampling_logp_difference/mean": 0.013137388974428177, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 185.4375, "completions/mean_terminated_length": 185.4375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3659963607788086, "epoch": 2.7095588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01739920974455112, "kl": 0.028903864324092865, "learning_rate": 2.918673266653865e-08, "loss": 0.0003, "num_tokens": 69718048.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6234164237976074, "sampling/importance_sampling_ratio/mean": 0.9997466802597046, "sampling/importance_sampling_ratio/min": 0.5808159708976746, "sampling/sampling_logp_difference/max": 0.5433213710784912, "sampling/sampling_logp_difference/mean": 0.014746871776878834, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 187.859375, "completions/mean_terminated_length": 187.859375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.39722689986228943, "epoch": 2.7107843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8294509745509189, "kl": 0.05414985120296478, "learning_rate": 2.8947381512233305e-08, "loss": -0.0086, "num_tokens": 69747591.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5189282894134521, "sampling/importance_sampling_ratio/mean": 0.9999840259552002, "sampling/importance_sampling_ratio/min": 0.577173113822937, "sampling/sampling_logp_difference/max": 0.5496129989624023, "sampling/sampling_logp_difference/mean": 0.016706984490156174, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 197.46875, "completions/mean_terminated_length": 197.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4055330753326416, "epoch": 2.7120098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 1.0365294035465076, "kl": 0.04686344042420387, "learning_rate": 2.8708986564901504e-08, "loss": 0.016, "num_tokens": 69778485.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999788761138916, "sampling/importance_sampling_ratio/min": 0.680418848991394, "sampling/sampling_logp_difference/max": 0.709975004196167, "sampling/sampling_logp_difference/mean": 0.015084546059370041, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 187.78125, "completions/mean_terminated_length": 187.78125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.2596806287765503, "epoch": 2.713235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.016154535797120406, "kl": 0.019631171599030495, "learning_rate": 2.8471548308469706e-08, "loss": 0.0002, "num_tokens": 69802615.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6073639392852783, "sampling/importance_sampling_ratio/mean": 0.9996476173400879, "sampling/importance_sampling_ratio/min": 0.6554355621337891, "sampling/sampling_logp_difference/max": 0.4745955467224121, "sampling/sampling_logp_difference/mean": 0.013249853625893593, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 183.921875, "completions/mean_terminated_length": 183.921875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3703579902648926, "epoch": 2.7144607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.930933398269968, "kl": 0.04081626236438751, "learning_rate": 2.8235067224922802e-08, "loss": 0.0157, "num_tokens": 69829122.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4594871997833252, "sampling/importance_sampling_ratio/mean": 0.9999802708625793, "sampling/importance_sampling_ratio/min": 0.6332386136054993, "sampling/sampling_logp_difference/max": 0.4569079875946045, "sampling/sampling_logp_difference/mean": 0.015322159975767136, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 233.03125, "completions/mean_terminated_length": 233.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.36985495686531067, "epoch": 2.715686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.018448594596314177, "kl": 0.022692425176501274, "learning_rate": 2.799954379430208e-08, "loss": 0.0002, "num_tokens": 69865732.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5356004238128662, "sampling/importance_sampling_ratio/mean": 1.0003145933151245, "sampling/importance_sampling_ratio/min": 0.6561621427536011, "sampling/sampling_logp_difference/max": 0.4289214611053467, "sampling/sampling_logp_difference/mean": 0.013818234205245972, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 172.84375, "completions/mean_terminated_length": 172.84375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.45772528648376465, "epoch": 2.7169117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.021401507381491623, "kl": 0.030462948605418205, "learning_rate": 2.7764978494705437e-08, "loss": 0.0003, "num_tokens": 69897178.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001213550567627, "sampling/importance_sampling_ratio/min": 0.6583269238471985, "sampling/sampling_logp_difference/max": 0.9321606159210205, "sampling/sampling_logp_difference/mean": 0.017726056277751923, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 167.3125, "completions/mean_terminated_length": 167.3125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.27990877628326416, "epoch": 2.718137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.014808584355121727, "kl": 0.024075474590063095, "learning_rate": 2.753137180228543e-08, "loss": 0.0002, "num_tokens": 69920606.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6250072717666626, "sampling/importance_sampling_ratio/mean": 1.0006778240203857, "sampling/importance_sampling_ratio/min": 0.6498124599456787, "sampling/sampling_logp_difference/max": 0.48551225662231445, "sampling/sampling_logp_difference/mean": 0.012928958982229233, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 246.40625, "completions/mean_terminated_length": 246.40625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.43565645813941956, "epoch": 2.719362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.024239787636330484, "kl": 0.04958684742450714, "learning_rate": 2.729872419124879e-08, "loss": 0.0005, "num_tokens": 69953832.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3965340852737427, "sampling/importance_sampling_ratio/mean": 0.9995464086532593, "sampling/importance_sampling_ratio/min": 0.6970493793487549, "sampling/sampling_logp_difference/max": 0.3608989715576172, "sampling/sampling_logp_difference/mean": 0.014671550132334232, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 169.765625, "completions/mean_terminated_length": 169.765625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.370225727558136, "epoch": 2.7205882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.025148167441551832, "kl": 0.029639123007655144, "learning_rate": 2.7067036133855636e-08, "loss": 0.0003, "num_tokens": 69983833.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3967820405960083, "sampling/importance_sampling_ratio/mean": 1.0000736713409424, "sampling/importance_sampling_ratio/min": 0.6781371235847473, "sampling/sampling_logp_difference/max": 0.38840579986572266, "sampling/sampling_logp_difference/mean": 0.015588011592626572, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.36063680052757263, "epoch": 2.721813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.02809174824148526, "kl": 0.04175316542387009, "learning_rate": 2.6836308100417872e-08, "loss": 0.0005, "num_tokens": 70012609.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7515075206756592, "sampling/importance_sampling_ratio/mean": 1.0000413656234741, "sampling/importance_sampling_ratio/min": 0.6815227270126343, "sampling/sampling_logp_difference/max": 0.5604767799377441, "sampling/sampling_logp_difference/mean": 0.014000261202454567, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 173.265625, "completions/mean_terminated_length": 173.265625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3790750205516815, "epoch": 2.7230392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.017831589623363682, "kl": 0.039492081850767136, "learning_rate": 2.6606540559298952e-08, "loss": 0.0004, "num_tokens": 70042258.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.531073808670044, "sampling/importance_sampling_ratio/mean": 1.0003293752670288, "sampling/importance_sampling_ratio/min": 0.6711210012435913, "sampling/sampling_logp_difference/max": 0.42596936225891113, "sampling/sampling_logp_difference/mean": 0.013877512887120247, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 189.171875, "completions/mean_terminated_length": 189.171875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3294864892959595, "epoch": 2.724264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.016183732643314658, "kl": 0.022651515901088715, "learning_rate": 2.6377733976912232e-08, "loss": 0.0002, "num_tokens": 70068909.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3656457662582397, "sampling/importance_sampling_ratio/mean": 1.000307321548462, "sampling/importance_sampling_ratio/min": 0.6056224703788757, "sampling/sampling_logp_difference/max": 0.5014984607696533, "sampling/sampling_logp_difference/mean": 0.01442730613052845, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 204.765625, "completions/mean_terminated_length": 204.765625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.35736751556396484, "epoch": 2.7254901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.019247193422217267, "kl": 0.027182430028915405, "learning_rate": 2.6149888817720733e-08, "loss": 0.0003, "num_tokens": 70101326.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6362637281417847, "sampling/importance_sampling_ratio/mean": 0.9998779892921448, "sampling/importance_sampling_ratio/min": 0.6739427447319031, "sampling/sampling_logp_difference/max": 0.4924154281616211, "sampling/sampling_logp_difference/mean": 0.014462153427302837, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 212.390625, "completions/mean_terminated_length": 212.390625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.38081106543540955, "epoch": 2.7267156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.013739196201482876, "kl": 0.021417230367660522, "learning_rate": 2.5923005544235545e-08, "loss": 0.0002, "num_tokens": 70138359.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.567293643951416, "sampling/importance_sampling_ratio/mean": 1.0004489421844482, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4679851531982422, "sampling/sampling_logp_difference/mean": 0.014387840405106544, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 326.515625, "completions/mean_terminated_length": 326.515625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4487899839878082, "epoch": 2.7279411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.5790579301037666, "kl": 0.03546018898487091, "learning_rate": 2.5697084617015475e-08, "loss": -0.006, "num_tokens": 70182664.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.5986748933792114, "sampling/importance_sampling_ratio/mean": 0.9997212290763855, "sampling/importance_sampling_ratio/min": 0.7025700807571411, "sampling/sampling_logp_difference/max": 0.4691751003265381, "sampling/sampling_logp_difference/mean": 0.0149573078379035, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 205.171875, "completions/mean_terminated_length": 205.171875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.39314544200897217, "epoch": 2.7291666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.044041662114271914, "kl": 0.05147264525294304, "learning_rate": 2.547212649466568e-08, "loss": 0.0006, "num_tokens": 70217875.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6088998317718506, "sampling/importance_sampling_ratio/mean": 0.999459445476532, "sampling/importance_sampling_ratio/min": 0.42561814188957214, "sampling/sampling_logp_difference/max": 0.854212760925293, "sampling/sampling_logp_difference/mean": 0.015132260508835316, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.38113486766815186, "epoch": 2.730392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 1.0513687121081103, "kl": 0.035049207508563995, "learning_rate": 2.5248131633836823e-08, "loss": 0.0093, "num_tokens": 70250679.0, "reward": -0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5746089220046997, "sampling/importance_sampling_ratio/mean": 0.9993321299552917, "sampling/importance_sampling_ratio/min": 0.5483723282814026, "sampling/sampling_logp_difference/max": 0.6008007526397705, "sampling/sampling_logp_difference/mean": 0.01642269268631935, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 220.15625, "completions/mean_terminated_length": 220.15625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.47911232709884644, "epoch": 2.7316176470588234, "frac_reward_zero_std": 0.5, "grad_norm": 1.2832749805390427, "kl": 0.04859183728694916, "learning_rate": 2.5025100489224406e-08, "loss": -0.0241, "num_tokens": 70283617.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5856502056121826, "sampling/importance_sampling_ratio/mean": 0.9997179508209229, "sampling/importance_sampling_ratio/min": 0.6418494582176208, "sampling/sampling_logp_difference/max": 0.4609944820404053, "sampling/sampling_logp_difference/mean": 0.017129074782133102, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 237.40625, "completions/mean_terminated_length": 237.40625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4551759958267212, "epoch": 2.732843137254902, "frac_reward_zero_std": 0.75, "grad_norm": 0.8730456578928805, "kl": 0.07192547619342804, "learning_rate": 2.480303351356733e-08, "loss": 0.0153, "num_tokens": 70320075.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4353076219558716, "sampling/importance_sampling_ratio/mean": 0.9998223781585693, "sampling/importance_sampling_ratio/min": 0.6555653810501099, "sampling/sampling_logp_difference/max": 0.4222571849822998, "sampling/sampling_logp_difference/mean": 0.015772780403494835, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 167.3125, "completions/mean_terminated_length": 167.3125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3926674723625183, "epoch": 2.7340686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.018773129247530156, "kl": 0.028011813759803772, "learning_rate": 2.4581931157647674e-08, "loss": 0.0003, "num_tokens": 70348191.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6062246561050415, "sampling/importance_sampling_ratio/mean": 1.0012869834899902, "sampling/importance_sampling_ratio/min": 0.6412144899368286, "sampling/sampling_logp_difference/max": 0.47388648986816406, "sampling/sampling_logp_difference/mean": 0.015152723528444767, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 200.546875, "completions/mean_terminated_length": 200.546875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3412466049194336, "epoch": 2.735294117647059, "frac_reward_zero_std": 0.75, "grad_norm": 0.7558585756028967, "kl": 0.023498691618442535, "learning_rate": 2.4361793870289028e-08, "loss": 0.0066, "num_tokens": 70379170.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3610817193984985, "sampling/importance_sampling_ratio/mean": 0.9996642470359802, "sampling/importance_sampling_ratio/min": 0.5478037595748901, "sampling/sampling_logp_difference/max": 0.6018381118774414, "sampling/sampling_logp_difference/mean": 0.013857526704668999, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.35404205322265625, "epoch": 2.736519607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.01642056073563973, "kl": 0.028945980593562126, "learning_rate": 2.4142622098356326e-08, "loss": 0.0003, "num_tokens": 70411066.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6273410320281982, "sampling/importance_sampling_ratio/mean": 1.0002670288085938, "sampling/importance_sampling_ratio/min": 0.6700373888015747, "sampling/sampling_logp_difference/max": 0.4869474172592163, "sampling/sampling_logp_difference/mean": 0.014044288545846939, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 203.359375, "completions/mean_terminated_length": 203.359375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3403061032295227, "epoch": 2.7377450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.016057319750129643, "kl": 0.023418106138706207, "learning_rate": 2.3924416286754345e-08, "loss": 0.0002, "num_tokens": 70439361.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 1.000166416168213, "sampling/importance_sampling_ratio/min": 0.6251910924911499, "sampling/sampling_logp_difference/max": 0.48689985275268555, "sampling/sampling_logp_difference/mean": 0.014232734218239784, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 233.328125, "completions/mean_terminated_length": 233.328125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2991639971733093, "epoch": 2.7389705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.012331143078647135, "kl": 0.01602299138903618, "learning_rate": 2.3707176878426882e-08, "loss": 0.0002, "num_tokens": 70472294.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.436407446861267, "sampling/importance_sampling_ratio/mean": 0.9996287822723389, "sampling/importance_sampling_ratio/min": 0.3998073935508728, "sampling/sampling_logp_difference/max": 0.9167723655700684, "sampling/sampling_logp_difference/mean": 0.01243562251329422, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 170.71875, "completions/mean_terminated_length": 170.71875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.34510964155197144, "epoch": 2.7401960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.015965299142909544, "kl": 0.02148594707250595, "learning_rate": 2.3490904314356407e-08, "loss": 0.0002, "num_tokens": 70500948.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4352436065673828, "sampling/importance_sampling_ratio/mean": 0.9997973442077637, "sampling/importance_sampling_ratio/min": 0.6628729701042175, "sampling/sampling_logp_difference/max": 0.41117191314697266, "sampling/sampling_logp_difference/mean": 0.01357237994670868, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 208.71875, "completions/mean_terminated_length": 208.71875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3250342309474945, "epoch": 2.741421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.013885240972677616, "kl": 0.01940862461924553, "learning_rate": 2.327559903356241e-08, "loss": 0.0002, "num_tokens": 70539666.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4104986190795898, "sampling/importance_sampling_ratio/mean": 1.0001335144042969, "sampling/importance_sampling_ratio/min": 0.6416339874267578, "sampling/sampling_logp_difference/max": 0.443737268447876, "sampling/sampling_logp_difference/mean": 0.012945730239152908, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 198.640625, "completions/mean_terminated_length": 198.640625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.33332884311676025, "epoch": 2.7426470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.012336247457790443, "kl": 0.02224317193031311, "learning_rate": 2.3061261473101002e-08, "loss": 0.0002, "num_tokens": 70574011.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.618847131729126, "sampling/importance_sampling_ratio/mean": 0.9995735883712769, "sampling/importance_sampling_ratio/min": 0.6375756859779358, "sampling/sampling_logp_difference/max": 0.48171424865722656, "sampling/sampling_logp_difference/mean": 0.01353538315743208, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 173.71875, "completions/mean_terminated_length": 173.71875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4061875641345978, "epoch": 2.743872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.7488505424053589, "kl": 0.03188773989677429, "learning_rate": 2.2847892068063755e-08, "loss": 0.0138, "num_tokens": 70606857.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6663976907730103, "sampling/importance_sampling_ratio/mean": 1.0003501176834106, "sampling/importance_sampling_ratio/min": 0.6072332859039307, "sampling/sampling_logp_difference/max": 0.5106642246246338, "sampling/sampling_logp_difference/mean": 0.017464924603700638, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 200.40625, "completions/mean_terminated_length": 200.40625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4441275894641876, "epoch": 2.7450980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.0628466768420897, "kl": 0.03030751273036003, "learning_rate": 2.263549125157721e-08, "loss": 0.0251, "num_tokens": 70640419.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5233913660049438, "sampling/importance_sampling_ratio/mean": 1.000167965888977, "sampling/importance_sampling_ratio/min": 0.7082679271697998, "sampling/sampling_logp_difference/max": 0.42093896865844727, "sampling/sampling_logp_difference/mean": 0.015648623928427696, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.386197566986084, "epoch": 2.7463235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.8935470667623276, "kl": 0.026635531336069107, "learning_rate": 2.242405945480147e-08, "loss": 0.0007, "num_tokens": 70668179.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3671354055404663, "sampling/importance_sampling_ratio/mean": 1.0001649856567383, "sampling/importance_sampling_ratio/min": 0.7245340347290039, "sampling/sampling_logp_difference/max": 0.32222652435302734, "sampling/sampling_logp_difference/mean": 0.015021894127130508, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 195.84375, "completions/mean_terminated_length": 195.84375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3563383221626282, "epoch": 2.747549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.013005726344480849, "kl": 0.019569067284464836, "learning_rate": 2.2213597106929605e-08, "loss": 0.0002, "num_tokens": 70703049.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5812333822250366, "sampling/importance_sampling_ratio/mean": 1.0001039505004883, "sampling/importance_sampling_ratio/min": 0.6924002170562744, "sampling/sampling_logp_difference/max": 0.4582052230834961, "sampling/sampling_logp_difference/mean": 0.014485219493508339, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 185.890625, "completions/mean_terminated_length": 185.890625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.35772231221199036, "epoch": 2.748774509803922, "frac_reward_zero_std": 0.5, "grad_norm": 1.287825080029086, "kl": 0.03768147900700569, "learning_rate": 2.200410463518704e-08, "loss": 0.0004, "num_tokens": 70730034.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3495383262634277, "sampling/importance_sampling_ratio/mean": 0.9996664524078369, "sampling/importance_sampling_ratio/min": 0.6629447340965271, "sampling/sampling_logp_difference/max": 0.41106367111206055, "sampling/sampling_logp_difference/mean": 0.014889972284436226, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 238.703125, "completions/mean_terminated_length": 238.703125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3390355706214905, "epoch": 2.75, "frac_reward_zero_std": 0.75, "grad_norm": 0.7348305087159989, "kl": 0.026839446276426315, "learning_rate": 2.1795582464830153e-08, "loss": -0.0189, "num_tokens": 70760751.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3670462369918823, "sampling/importance_sampling_ratio/mean": 1.0002970695495605, "sampling/importance_sampling_ratio/min": 0.6226705312728882, "sampling/sampling_logp_difference/max": 0.4737377166748047, "sampling/sampling_logp_difference/mean": 0.013434633612632751, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2933042645454407, "epoch": 2.751225490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.023499611836145, "kl": 0.05593154579401016, "learning_rate": 2.1588031019145636e-08, "loss": 0.0023, "num_tokens": 70786807.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.374788522720337, "sampling/importance_sampling_ratio/mean": 1.0001814365386963, "sampling/importance_sampling_ratio/min": 0.6192825436592102, "sampling/sampling_logp_difference/max": 0.47919368743896484, "sampling/sampling_logp_difference/mean": 0.012308412231504917, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 214.828125, "completions/mean_terminated_length": 214.828125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3517012596130371, "epoch": 2.752450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.012697852038452596, "kl": 0.019416378811001778, "learning_rate": 2.13814507194498e-08, "loss": 0.0002, "num_tokens": 70817516.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.44302237033844, "sampling/importance_sampling_ratio/mean": 1.000140905380249, "sampling/importance_sampling_ratio/min": 0.6577356457710266, "sampling/sampling_logp_difference/max": 0.41895222663879395, "sampling/sampling_logp_difference/mean": 0.01361516211181879, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 216.109375, "completions/mean_terminated_length": 216.109375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.5082956552505493, "epoch": 2.7536764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.9445501706534202, "kl": 0.055819157510995865, "learning_rate": 2.1175841985087707e-08, "loss": -0.0334, "num_tokens": 70852131.0, "reward": -0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.6112245321273804, "sampling/importance_sampling_ratio/mean": 1.000665545463562, "sampling/importance_sampling_ratio/min": 0.618553876876831, "sampling/sampling_logp_difference/max": 0.48037099838256836, "sampling/sampling_logp_difference/mean": 0.018055545166134834, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 214.78125, "completions/mean_terminated_length": 214.78125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.37545245885849, "epoch": 2.7549019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 1.0201676838857112, "kl": 0.024367734789848328, "learning_rate": 2.097120523343199e-08, "loss": 0.0087, "num_tokens": 70884005.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5336076021194458, "sampling/importance_sampling_ratio/mean": 0.9998330473899841, "sampling/importance_sampling_ratio/min": 0.6409609913825989, "sampling/sampling_logp_difference/max": 0.4447866678237915, "sampling/sampling_logp_difference/mean": 0.014355825260281563, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 161.359375, "completions/mean_terminated_length": 161.359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.42221778631210327, "epoch": 2.756127450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.9751308068647037, "kl": 0.047976039350032806, "learning_rate": 2.076754087988214e-08, "loss": 0.0062, "num_tokens": 70909356.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.541296362876892, "sampling/importance_sampling_ratio/mean": 0.9989932775497437, "sampling/importance_sampling_ratio/min": 0.6262674927711487, "sampling/sampling_logp_difference/max": 0.46797776222229004, "sampling/sampling_logp_difference/mean": 0.016862986609339714, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 216.328125, "completions/mean_terminated_length": 216.328125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.28034213185310364, "epoch": 2.7573529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0129235382913119, "kl": 0.016742991283535957, "learning_rate": 2.0564849337864122e-08, "loss": 0.0002, "num_tokens": 70940945.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5344040393829346, "sampling/importance_sampling_ratio/mean": 0.9997872114181519, "sampling/importance_sampling_ratio/min": 0.6181705594062805, "sampling/sampling_logp_difference/max": 0.4809908866882324, "sampling/sampling_logp_difference/mean": 0.011822382919490337, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3743704557418823, "epoch": 2.758578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.020817038768702053, "kl": 0.026140248402953148, "learning_rate": 2.036313101882875e-08, "loss": 0.0003, "num_tokens": 70977385.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.324415922164917, "sampling/importance_sampling_ratio/mean": 1.0000091791152954, "sampling/importance_sampling_ratio/min": 0.6054502129554749, "sampling/sampling_logp_difference/max": 0.5017828941345215, "sampling/sampling_logp_difference/mean": 0.01400675904005766, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 209.3125, "completions/mean_terminated_length": 209.3125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.43410447239875793, "epoch": 2.7598039215686274, "frac_reward_zero_std": 0.5, "grad_norm": 1.1545171007395325, "kl": 0.044753510504961014, "learning_rate": 2.0162386332251648e-08, "loss": -0.0003, "num_tokens": 71011165.0, "reward": 0.125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.521191120147705, "sampling/importance_sampling_ratio/mean": 0.9995642900466919, "sampling/importance_sampling_ratio/min": 0.6394612789154053, "sampling/sampling_logp_difference/max": 0.4471292495727539, "sampling/sampling_logp_difference/mean": 0.01565314084291458, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 224.984375, "completions/mean_terminated_length": 224.984375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4113616645336151, "epoch": 2.7610294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.1315484898630275, "kl": 0.03205491602420807, "learning_rate": 1.9962615685631568e-08, "loss": 0.031, "num_tokens": 71042924.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4372361898422241, "sampling/importance_sampling_ratio/mean": 1.000169277191162, "sampling/importance_sampling_ratio/min": 0.6896570324897766, "sampling/sampling_logp_difference/max": 0.3715609312057495, "sampling/sampling_logp_difference/mean": 0.014121998101472855, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 198.265625, "completions/mean_terminated_length": 198.265625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3452353775501251, "epoch": 2.7622549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.037089866348583675, "kl": 0.03471102565526962, "learning_rate": 1.976381948449035e-08, "loss": 0.0004, "num_tokens": 71079693.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6159518957138062, "sampling/importance_sampling_ratio/mean": 0.9987235069274902, "sampling/importance_sampling_ratio/min": 0.6117444038391113, "sampling/sampling_logp_difference/max": 0.4914407730102539, "sampling/sampling_logp_difference/mean": 0.014984086155891418, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 191.609375, "completions/mean_terminated_length": 191.609375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4518758952617645, "epoch": 2.763480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.030282372411408555, "kl": 0.06672383844852448, "learning_rate": 1.9565998132371808e-08, "loss": 0.0007, "num_tokens": 71114324.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5612155199050903, "sampling/importance_sampling_ratio/mean": 1.0006022453308105, "sampling/importance_sampling_ratio/min": 0.6825100779533386, "sampling/sampling_logp_difference/max": 0.44546473026275635, "sampling/sampling_logp_difference/mean": 0.01747298426926136, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 242.953125, "completions/mean_terminated_length": 242.953125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3945736587047577, "epoch": 2.764705882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.787144812149263, "kl": 0.03934440016746521, "learning_rate": 1.936915203084055e-08, "loss": 0.0112, "num_tokens": 71149137.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.582056999206543, "sampling/importance_sampling_ratio/mean": 1.000464916229248, "sampling/importance_sampling_ratio/min": 0.6629458665847778, "sampling/sampling_logp_difference/max": 0.4587259292602539, "sampling/sampling_logp_difference/mean": 0.015069615095853806, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 206.421875, "completions/mean_terminated_length": 206.421875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.39079004526138306, "epoch": 2.7659313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.08122842759486407, "kl": 0.04704172909259796, "learning_rate": 1.9173281579481894e-08, "loss": 0.0005, "num_tokens": 71179676.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5323126316070557, "sampling/importance_sampling_ratio/mean": 1.0000157356262207, "sampling/importance_sampling_ratio/min": 0.6178263425827026, "sampling/sampling_logp_difference/max": 0.48154783248901367, "sampling/sampling_logp_difference/mean": 0.014137955382466316, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 189.390625, "completions/mean_terminated_length": 189.390625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.4201244115829468, "epoch": 2.767156862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.9584494048322517, "kl": 0.03773742541670799, "learning_rate": 1.897838717590028e-08, "loss": 0.0236, "num_tokens": 71214613.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4423161745071411, "sampling/importance_sampling_ratio/mean": 0.9996387958526611, "sampling/importance_sampling_ratio/min": 0.5998695492744446, "sampling/sampling_logp_difference/max": 0.5110430717468262, "sampling/sampling_logp_difference/mean": 0.015818282961845398, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 189.671875, "completions/mean_terminated_length": 189.671875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3822367191314697, "epoch": 2.7683823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.01981045146895738, "kl": 0.02560245618224144, "learning_rate": 1.8784469215719077e-08, "loss": 0.0003, "num_tokens": 71245856.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4124040603637695, "sampling/importance_sampling_ratio/mean": 1.0001107454299927, "sampling/importance_sampling_ratio/min": 0.7300852537155151, "sampling/sampling_logp_difference/max": 0.3452932834625244, "sampling/sampling_logp_difference/mean": 0.015097684226930141, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 222.78125, "completions/mean_terminated_length": 222.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.38841933012008667, "epoch": 2.769607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.7455509339217324, "kl": 0.036847107112407684, "learning_rate": 1.8591528092579524e-08, "loss": -0.0152, "num_tokens": 71277522.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.47059965133667, "sampling/importance_sampling_ratio/mean": 0.9993435144424438, "sampling/importance_sampling_ratio/min": 0.6143800616264343, "sampling/sampling_logp_difference/max": 0.48714160919189453, "sampling/sampling_logp_difference/mean": 0.014544347301125526, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.43171802163124084, "epoch": 2.7708333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.018514479874583628, "kl": 0.03312084823846817, "learning_rate": 1.8399564198139707e-08, "loss": 0.0003, "num_tokens": 71313386.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6046829223632812, "sampling/importance_sampling_ratio/mean": 1.0005278587341309, "sampling/importance_sampling_ratio/min": 0.6150000691413879, "sampling/sampling_logp_difference/max": 0.4861328601837158, "sampling/sampling_logp_difference/mean": 0.015385524369776249, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28259339928627014, "epoch": 2.7720588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.013630810725824267, "kl": 0.02327803522348404, "learning_rate": 1.8208577922074308e-08, "loss": 0.0002, "num_tokens": 71342894.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3212229013442993, "sampling/importance_sampling_ratio/mean": 1.0000630617141724, "sampling/importance_sampling_ratio/min": 0.6086598634719849, "sampling/sampling_logp_difference/max": 0.49649572372436523, "sampling/sampling_logp_difference/mean": 0.01166549976915121, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 198.34375, "completions/mean_terminated_length": 198.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3560211956501007, "epoch": 2.7732843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.8075662995223074, "kl": 0.024799000471830368, "learning_rate": 1.8018569652073378e-08, "loss": -0.034, "num_tokens": 71377780.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5474638938903809, "sampling/importance_sampling_ratio/mean": 0.9999863505363464, "sampling/importance_sampling_ratio/min": 0.688747227191925, "sampling/sampling_logp_difference/max": 0.436617374420166, "sampling/sampling_logp_difference/mean": 0.014782153069972992, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 173.390625, "completions/mean_terminated_length": 173.390625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3286271095275879, "epoch": 2.7745098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.016444057765425384, "kl": 0.01911253109574318, "learning_rate": 1.7829539773841608e-08, "loss": 0.0002, "num_tokens": 71403821.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.409652829170227, "sampling/importance_sampling_ratio/mean": 1.0003719329833984, "sampling/importance_sampling_ratio/min": 0.4979653060436249, "sampling/sampling_logp_difference/max": 0.6972248554229736, "sampling/sampling_logp_difference/mean": 0.013749302364885807, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 270.46875, "completions/mean_terminated_length": 270.46875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.41420799493789673, "epoch": 2.775735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 0.7681025216946435, "kl": 0.023575209081172943, "learning_rate": 1.7641488671097606e-08, "loss": 0.0032, "num_tokens": 71439979.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.5751935243606567, "sampling/importance_sampling_ratio/mean": 1.0003597736358643, "sampling/importance_sampling_ratio/min": 0.6245823502540588, "sampling/sampling_logp_difference/max": 0.4706721305847168, "sampling/sampling_logp_difference/mean": 0.014989707618951797, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 196.203125, "completions/mean_terminated_length": 196.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3639991581439972, "epoch": 2.7769607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.9776378086283738, "kl": 0.027560634538531303, "learning_rate": 1.745441672557335e-08, "loss": -0.0501, "num_tokens": 71469320.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.8810596466064453, "sampling/importance_sampling_ratio/mean": 0.9995264410972595, "sampling/importance_sampling_ratio/min": 0.6577244997024536, "sampling/sampling_logp_difference/max": 0.6318352222442627, "sampling/sampling_logp_difference/mean": 0.014080442488193512, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 200.5625, "completions/mean_terminated_length": 200.5625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3483728766441345, "epoch": 2.778186274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.7482550607243168, "kl": 0.02732473611831665, "learning_rate": 1.7268324317012973e-08, "loss": -0.0, "num_tokens": 71505644.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5977916717529297, "sampling/importance_sampling_ratio/mean": 1.0005908012390137, "sampling/importance_sampling_ratio/min": 0.6355277299880981, "sampling/sampling_logp_difference/max": 0.46862244606018066, "sampling/sampling_logp_difference/mean": 0.01260870136320591, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3717099130153656, "epoch": 2.7794117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.01416227546876349, "kl": 0.030278926715254784, "learning_rate": 1.7083211823172184e-08, "loss": 0.0003, "num_tokens": 71548436.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.6404244899749756, "sampling/importance_sampling_ratio/mean": 0.9999901056289673, "sampling/importance_sampling_ratio/min": 0.6045262217521667, "sampling/sampling_logp_difference/max": 0.5033102035522461, "sampling/sampling_logp_difference/mean": 0.014114852994680405, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 161.03125, "completions/mean_terminated_length": 161.03125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3516573905944824, "epoch": 2.780637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.019098955354129955, "kl": 0.026305729523301125, "learning_rate": 1.6899079619817792e-08, "loss": 0.0003, "num_tokens": 71579350.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4534450769424438, "sampling/importance_sampling_ratio/mean": 0.9997409582138062, "sampling/importance_sampling_ratio/min": 0.5638967752456665, "sampling/sampling_logp_difference/max": 0.5728840827941895, "sampling/sampling_logp_difference/mean": 0.015011224895715714, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 202.921875, "completions/mean_terminated_length": 202.921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3277509808540344, "epoch": 2.781862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.011914898594531468, "kl": 0.016980361193418503, "learning_rate": 1.6715928080726415e-08, "loss": 0.0002, "num_tokens": 71606769.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4952607154846191, "sampling/importance_sampling_ratio/mean": 0.9994877576828003, "sampling/importance_sampling_ratio/min": 0.6171972751617432, "sampling/sampling_logp_difference/max": 0.48256659507751465, "sampling/sampling_logp_difference/mean": 0.013675330206751823, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 229.046875, "completions/mean_terminated_length": 229.046875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.33403104543685913, "epoch": 2.7830882352941178, "frac_reward_zero_std": 1.0, "grad_norm": 0.014551090533169688, "kl": 0.022232726216316223, "learning_rate": 1.653375757768405e-08, "loss": 0.0002, "num_tokens": 71643844.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5311309099197388, "sampling/importance_sampling_ratio/mean": 0.9995133280754089, "sampling/importance_sampling_ratio/min": 0.6396133303642273, "sampling/sampling_logp_difference/max": 0.4468914866447449, "sampling/sampling_logp_difference/mean": 0.013440671376883984, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.34039393067359924, "epoch": 2.784313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.02489054595610817, "kl": 0.030884843319654465, "learning_rate": 1.6352568480485275e-08, "loss": 0.0003, "num_tokens": 71677360.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5885577201843262, "sampling/importance_sampling_ratio/mean": 0.9998325705528259, "sampling/importance_sampling_ratio/min": 0.6298431158065796, "sampling/sampling_logp_difference/max": 0.4628264904022217, "sampling/sampling_logp_difference/mean": 0.014835665933787823, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 227.09375, "completions/mean_terminated_length": 227.09375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.41055136919021606, "epoch": 2.7855392156862746, "frac_reward_zero_std": 1.0, "grad_norm": 0.022139890170848206, "kl": 0.022946510463953018, "learning_rate": 1.6172361156932547e-08, "loss": 0.0002, "num_tokens": 71711094.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004057884216309, "sampling/importance_sampling_ratio/min": 0.5827050805091858, "sampling/sampling_logp_difference/max": 0.7382068634033203, "sampling/sampling_logp_difference/mean": 0.015440518967807293, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 185.46875, "completions/mean_terminated_length": 185.46875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3047197163105011, "epoch": 2.786764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.017871062297091952, "kl": 0.024374118074774742, "learning_rate": 1.5993135972835303e-08, "loss": 0.0002, "num_tokens": 71736532.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4740394353866577, "sampling/importance_sampling_ratio/mean": 0.999732255935669, "sampling/importance_sampling_ratio/min": 0.36422932147979736, "sampling/sampling_logp_difference/max": 1.0099716186523438, "sampling/sampling_logp_difference/mean": 0.013879004865884781, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 191.78125, "completions/mean_terminated_length": 191.78125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3157747983932495, "epoch": 2.7879901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.01774562023969574, "kl": 0.021239880472421646, "learning_rate": 1.581489329200919e-08, "loss": 0.0002, "num_tokens": 71766326.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 0.9995769262313843, "sampling/importance_sampling_ratio/min": 0.6960172653198242, "sampling/sampling_logp_difference/max": 0.49199533462524414, "sampling/sampling_logp_difference/mean": 0.013254116289317608, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 200.3125, "completions/mean_terminated_length": 200.3125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3812054991722107, "epoch": 2.7892156862745097, "frac_reward_zero_std": 0.75, "grad_norm": 0.760931902743617, "kl": 0.10009460896253586, "learning_rate": 1.5637633476275724e-08, "loss": -0.0047, "num_tokens": 71795178.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.2994608879089355, "sampling/importance_sampling_ratio/mean": 1.0003132820129395, "sampling/importance_sampling_ratio/min": 0.62801194190979, "sampling/sampling_logp_difference/max": 0.4651961326599121, "sampling/sampling_logp_difference/mean": 0.015429170802235603, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 186.578125, "completions/mean_terminated_length": 186.578125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2862929105758667, "epoch": 2.7904411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.7032311059803306, "kl": 0.02835479937493801, "learning_rate": 1.5461356885461075e-08, "loss": 0.0011, "num_tokens": 71821439.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.590358853340149, "sampling/importance_sampling_ratio/mean": 0.9992215037345886, "sampling/importance_sampling_ratio/min": 0.6701014041900635, "sampling/sampling_logp_difference/max": 0.4639596939086914, "sampling/sampling_logp_difference/mean": 0.011490806937217712, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 251.4375, "completions/mean_terminated_length": 251.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4205988347530365, "epoch": 2.7916666666666665, "frac_reward_zero_std": 0.75, "grad_norm": 0.6352273994365715, "kl": 0.04728353023529053, "learning_rate": 1.528606387739545e-08, "loss": 0.0001, "num_tokens": 71862955.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7753299474716187, "sampling/importance_sampling_ratio/mean": 0.9998862147331238, "sampling/importance_sampling_ratio/min": 0.6273989677429199, "sampling/sampling_logp_difference/max": 0.5739863514900208, "sampling/sampling_logp_difference/mean": 0.014236108399927616, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 216.28125, "completions/mean_terminated_length": 216.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3384827673435211, "epoch": 2.792892156862745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0139936031800226, "kl": 0.021461669355630875, "learning_rate": 1.5111754807912546e-08, "loss": 0.0002, "num_tokens": 71892045.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5768814086914062, "sampling/importance_sampling_ratio/mean": 1.000150203704834, "sampling/importance_sampling_ratio/min": 0.6033788919448853, "sampling/sampling_logp_difference/max": 0.5052099227905273, "sampling/sampling_logp_difference/mean": 0.014072628691792488, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 206.703125, "completions/mean_terminated_length": 206.703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.26644182205200195, "epoch": 2.7941176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.013526680091452808, "kl": 0.016012419015169144, "learning_rate": 1.493843003084888e-08, "loss": 0.0002, "num_tokens": 71927162.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.403918981552124, "sampling/importance_sampling_ratio/mean": 0.9997563362121582, "sampling/importance_sampling_ratio/min": 0.620281457901001, "sampling/sampling_logp_difference/max": 0.4775819778442383, "sampling/sampling_logp_difference/mean": 0.011524304747581482, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.34191954135894775, "epoch": 2.795343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.02604340860845786, "kl": 0.029353775084018707, "learning_rate": 1.4766089898042677e-08, "loss": 0.0003, "num_tokens": 71953890.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.9246152639389038, "sampling/importance_sampling_ratio/mean": 1.0005629062652588, "sampling/importance_sampling_ratio/min": 0.6502727270126343, "sampling/sampling_logp_difference/max": 0.6547261476516724, "sampling/sampling_logp_difference/mean": 0.014699749648571014, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3046913743019104, "epoch": 2.7965686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.014346661295387824, "kl": 0.018736181780695915, "learning_rate": 1.4594734759333482e-08, "loss": 0.0002, "num_tokens": 71983954.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6348410844802856, "sampling/importance_sampling_ratio/mean": 1.0004372596740723, "sampling/importance_sampling_ratio/min": 0.6498619914054871, "sampling/sampling_logp_difference/max": 0.4915456771850586, "sampling/sampling_logp_difference/mean": 0.013584461063146591, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 166.15625, "completions/mean_terminated_length": 166.15625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3658546507358551, "epoch": 2.797794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.015578315080727533, "kl": 0.022142380475997925, "learning_rate": 1.4424364962561386e-08, "loss": 0.0002, "num_tokens": 72012908.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4352006912231445, "sampling/importance_sampling_ratio/mean": 1.000298261642456, "sampling/importance_sampling_ratio/min": 0.6274946928024292, "sampling/sampling_logp_difference/max": 0.4660201072692871, "sampling/sampling_logp_difference/mean": 0.014988088980317116, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 172.84375, "completions/mean_terminated_length": 172.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.39948737621307373, "epoch": 2.799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.9562705683755706, "kl": 0.09000791609287262, "learning_rate": 1.4254980853566246e-08, "loss": 0.0205, "num_tokens": 72042242.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5422638654708862, "sampling/importance_sampling_ratio/mean": 0.9992375373840332, "sampling/importance_sampling_ratio/min": 0.6525204181671143, "sampling/sampling_logp_difference/max": 0.43325138092041016, "sampling/sampling_logp_difference/mean": 0.0161836426705122, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 170.71875, "completions/mean_terminated_length": 170.71875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3471698760986328, "epoch": 2.8002450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.021503997632601194, "kl": 0.028218526393175125, "learning_rate": 1.4086582776187239e-08, "loss": 0.0003, "num_tokens": 72073456.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4221441745758057, "sampling/importance_sampling_ratio/mean": 1.0006734132766724, "sampling/importance_sampling_ratio/min": 0.6298839449882507, "sampling/sampling_logp_difference/max": 0.4622197151184082, "sampling/sampling_logp_difference/mean": 0.015107318758964539, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.44408896565437317, "epoch": 2.8014705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.7316867389262184, "kl": 0.028132587671279907, "learning_rate": 1.3919171072261537e-08, "loss": -0.0022, "num_tokens": 72109064.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5388634204864502, "sampling/importance_sampling_ratio/mean": 1.0005836486816406, "sampling/importance_sampling_ratio/min": 0.6987859010696411, "sampling/sampling_logp_difference/max": 0.4310441017150879, "sampling/sampling_logp_difference/mean": 0.015850011259317398, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3989129066467285, "epoch": 2.8026960784313726, "frac_reward_zero_std": 1.0, "grad_norm": 0.014688646078152737, "kl": 0.020441781729459763, "learning_rate": 1.3752746081624467e-08, "loss": 0.0002, "num_tokens": 72143688.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4659093618392944, "sampling/importance_sampling_ratio/mean": 0.9998365640640259, "sampling/importance_sampling_ratio/min": 0.6817200183868408, "sampling/sampling_logp_difference/max": 0.3831362724304199, "sampling/sampling_logp_difference/mean": 0.014877324923872948, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 197.734375, "completions/mean_terminated_length": 197.734375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.31073617935180664, "epoch": 2.803921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.012405867713131104, "kl": 0.019323110580444336, "learning_rate": 1.3587308142108178e-08, "loss": 0.0002, "num_tokens": 72174919.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4327930212020874, "sampling/importance_sampling_ratio/mean": 1.0001391172409058, "sampling/importance_sampling_ratio/min": 0.6468679308891296, "sampling/sampling_logp_difference/max": 0.43561315536499023, "sampling/sampling_logp_difference/mean": 0.013890000060200691, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 175.15625, "completions/mean_terminated_length": 175.15625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3362789750099182, "epoch": 2.8051470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.017843234505621534, "kl": 0.024533363059163094, "learning_rate": 1.3422857589541148e-08, "loss": 0.0002, "num_tokens": 72200481.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.433450698852539, "sampling/importance_sampling_ratio/mean": 1.0008256435394287, "sampling/importance_sampling_ratio/min": 0.650992751121521, "sampling/sampling_logp_difference/max": 0.429256796836853, "sampling/sampling_logp_difference/mean": 0.014599323272705078, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.36659950017929077, "epoch": 2.806372549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.7103152574303739, "kl": 0.06570275127887726, "learning_rate": 1.3259394757747677e-08, "loss": 0.0083, "num_tokens": 72228033.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5661461353302002, "sampling/importance_sampling_ratio/mean": 0.9998388290405273, "sampling/importance_sampling_ratio/min": 0.6300721168518066, "sampling/sampling_logp_difference/max": 0.46192097663879395, "sampling/sampling_logp_difference/mean": 0.015525387600064278, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 228.828125, "completions/mean_terminated_length": 228.828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3071904182434082, "epoch": 2.8075980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.016346678045750535, "kl": 0.020016958937048912, "learning_rate": 1.3096919978546838e-08, "loss": 0.0002, "num_tokens": 72257862.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003342628479004, "sampling/importance_sampling_ratio/min": 0.6487526297569275, "sampling/sampling_logp_difference/max": 0.9694099426269531, "sampling/sampling_logp_difference/mean": 0.013793135061860085, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 200.453125, "completions/mean_terminated_length": 200.453125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3286440074443817, "epoch": 2.8088235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.013722676646069513, "kl": 0.019434725865721703, "learning_rate": 1.2935433581752365e-08, "loss": 0.0002, "num_tokens": 72287827.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.533599853515625, "sampling/importance_sampling_ratio/mean": 0.9998020529747009, "sampling/importance_sampling_ratio/min": 0.6816995143890381, "sampling/sampling_logp_difference/max": 0.42761778831481934, "sampling/sampling_logp_difference/mean": 0.013315289281308651, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 256.859375, "completions/mean_terminated_length": 256.859375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3069850206375122, "epoch": 2.810049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.013881546272108672, "kl": 0.017193015664815903, "learning_rate": 1.2774935895171091e-08, "loss": 0.0002, "num_tokens": 72320202.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744256973266602, "sampling/importance_sampling_ratio/mean": 1.0000576972961426, "sampling/importance_sampling_ratio/min": 0.6482973694801331, "sampling/sampling_logp_difference/max": 0.4538905620574951, "sampling/sampling_logp_difference/mean": 0.011956065893173218, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 200.953125, "completions/mean_terminated_length": 200.953125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.25608932971954346, "epoch": 2.811274509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.016591961813083148, "kl": 0.025756172835826874, "learning_rate": 1.2615427244603405e-08, "loss": 0.0003, "num_tokens": 72347927.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5533801317214966, "sampling/importance_sampling_ratio/mean": 1.000268578529358, "sampling/importance_sampling_ratio/min": 0.6556463241577148, "sampling/sampling_logp_difference/max": 0.4404332637786865, "sampling/sampling_logp_difference/mean": 0.010999459773302078, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 218.984375, "completions/mean_terminated_length": 218.984375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3019634187221527, "epoch": 2.8125, "frac_reward_zero_std": 1.0, "grad_norm": 0.01433802192062887, "kl": 0.019851867109537125, "learning_rate": 1.2456907953841633e-08, "loss": 0.0002, "num_tokens": 72378054.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.405820369720459, "sampling/importance_sampling_ratio/mean": 0.9999945163726807, "sampling/importance_sampling_ratio/min": 0.6096331477165222, "sampling/sampling_logp_difference/max": 0.49489784240722656, "sampling/sampling_logp_difference/mean": 0.012230746448040009, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 214.59375, "completions/mean_terminated_length": 214.59375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3920062482357025, "epoch": 2.813725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 1.1001250463806667, "kl": 0.033399369567632675, "learning_rate": 1.2299378344669986e-08, "loss": -0.0648, "num_tokens": 72406684.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4886319637298584, "sampling/importance_sampling_ratio/mean": 0.9996265769004822, "sampling/importance_sampling_ratio/min": 0.5238063335418701, "sampling/sampling_logp_difference/max": 0.6466332674026489, "sampling/sampling_logp_difference/mean": 0.014440304599702358, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 252.5625, "completions/mean_terminated_length": 252.5625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.40310561656951904, "epoch": 2.814950980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.01572991801480214, "kl": 0.023704007267951965, "learning_rate": 1.2142838736863559e-08, "loss": 0.0002, "num_tokens": 72438160.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.608891487121582, "sampling/importance_sampling_ratio/mean": 1.000126838684082, "sampling/importance_sampling_ratio/min": 0.4627833664417267, "sampling/sampling_logp_difference/max": 0.7704962491989136, "sampling/sampling_logp_difference/mean": 0.015635818243026733, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 166.4375, "completions/mean_terminated_length": 166.4375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.4271998405456543, "epoch": 2.8161764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 1.0289753184079062, "kl": 0.0330132395029068, "learning_rate": 1.1987289448187777e-08, "loss": -0.0284, "num_tokens": 72466684.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.5406886339187622, "sampling/importance_sampling_ratio/mean": 0.9999655485153198, "sampling/importance_sampling_ratio/min": 0.7300183773040771, "sampling/sampling_logp_difference/max": 0.43222951889038086, "sampling/sampling_logp_difference/mean": 0.01558359619230032, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 217.3125, "completions/mean_terminated_length": 217.3125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.4008779525756836, "epoch": 2.8174019607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.027301807947582572, "kl": 0.027768369764089584, "learning_rate": 1.183273079439795e-08, "loss": 0.0003, "num_tokens": 72503008.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5744314193725586, "sampling/importance_sampling_ratio/mean": 1.000143051147461, "sampling/importance_sampling_ratio/min": 0.5038020610809326, "sampling/sampling_logp_difference/max": 0.6855719089508057, "sampling/sampling_logp_difference/mean": 0.015518108382821083, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 197.578125, "completions/mean_terminated_length": 197.578125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.4736689329147339, "epoch": 2.818627450980392, "frac_reward_zero_std": 0.75, "grad_norm": 0.9397093439566856, "kl": 0.03798678144812584, "learning_rate": 1.167916308923822e-08, "loss": -0.0049, "num_tokens": 72538037.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4995191097259521, "sampling/importance_sampling_ratio/mean": 0.9994603395462036, "sampling/importance_sampling_ratio/min": 0.6323956847190857, "sampling/sampling_logp_difference/max": 0.4582400321960449, "sampling/sampling_logp_difference/mean": 0.016999687999486923, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 232.640625, "completions/mean_terminated_length": 232.640625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.27715107798576355, "epoch": 2.8198529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03962575330413272, "kl": 0.025751516222953796, "learning_rate": 1.152658664444145e-08, "loss": 0.0002, "num_tokens": 72571502.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.434734582901001, "sampling/importance_sampling_ratio/mean": 1.0000114440917969, "sampling/importance_sampling_ratio/min": 0.6264632344245911, "sampling/sampling_logp_difference/max": 0.4676651954650879, "sampling/sampling_logp_difference/mean": 0.011388463899493217, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 219.28125, "completions/mean_terminated_length": 219.28125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4073230028152466, "epoch": 2.821078431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0163658782529671, "kl": 0.024018779397010803, "learning_rate": 1.1375001769727999e-08, "loss": 0.0002, "num_tokens": 72604784.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6203665733337402, "sampling/importance_sampling_ratio/mean": 0.9998045563697815, "sampling/importance_sampling_ratio/min": 0.6656513810157776, "sampling/sampling_logp_difference/max": 0.4826524257659912, "sampling/sampling_logp_difference/mean": 0.015417106449604034, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 220.4375, "completions/mean_terminated_length": 220.4375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.41577786207199097, "epoch": 2.8223039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7588138821700442, "kl": 0.047006770968437195, "learning_rate": 1.1224408772805671e-08, "loss": 0.0089, "num_tokens": 72638092.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.9848963022232056, "sampling/importance_sampling_ratio/mean": 0.9997138977050781, "sampling/importance_sampling_ratio/min": 0.6137931942939758, "sampling/sampling_logp_difference/max": 0.6855666637420654, "sampling/sampling_logp_difference/mean": 0.015783226117491722, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 215.28125, "completions/mean_terminated_length": 215.28125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.40591156482696533, "epoch": 2.8235294117647056, "frac_reward_zero_std": 0.5, "grad_norm": 1.4637659212274494, "kl": 0.08024704456329346, "learning_rate": 1.1074807959368715e-08, "loss": 0.043, "num_tokens": 72668238.0, "reward": 0.78125, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.411345362663269, "sampling/importance_sampling_ratio/mean": 1.0003767013549805, "sampling/importance_sampling_ratio/min": 0.6954315900802612, "sampling/sampling_logp_difference/max": 0.363222599029541, "sampling/sampling_logp_difference/mean": 0.014792080037295818, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3982296884059906, "epoch": 2.8247549019607843, "frac_reward_zero_std": 0.5, "grad_norm": 1.2458175002931993, "kl": 0.027796974405646324, "learning_rate": 1.0926199633097154e-08, "loss": 0.0093, "num_tokens": 72699442.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4560644626617432, "sampling/importance_sampling_ratio/mean": 0.9998251795768738, "sampling/importance_sampling_ratio/min": 0.5910095572471619, "sampling/sampling_logp_difference/max": 0.5259230136871338, "sampling/sampling_logp_difference/mean": 0.01536161731928587, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 174.234375, "completions/mean_terminated_length": 174.234375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.42545390129089355, "epoch": 2.825980392156863, "frac_reward_zero_std": 0.75, "grad_norm": 1.1544362220501005, "kl": 0.04277203232049942, "learning_rate": 1.0778584095656685e-08, "loss": -0.0173, "num_tokens": 72724449.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5674940347671509, "sampling/importance_sampling_ratio/mean": 1.0002422332763672, "sampling/importance_sampling_ratio/min": 0.6310010552406311, "sampling/sampling_logp_difference/max": 0.4604477882385254, "sampling/sampling_logp_difference/mean": 0.017049824818968773, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 234.296875, "completions/mean_terminated_length": 234.296875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.502498209476471, "epoch": 2.827205882352941, "frac_reward_zero_std": 0.5, "grad_norm": 1.0590844835191695, "kl": 0.043628957122564316, "learning_rate": 1.0631961646697384e-08, "loss": 0.0025, "num_tokens": 72762852.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.8157446384429932, "sampling/importance_sampling_ratio/mean": 1.0002353191375732, "sampling/importance_sampling_ratio/min": 0.6138596534729004, "sampling/sampling_logp_difference/max": 0.5964956283569336, "sampling/sampling_logp_difference/mean": 0.016772117465734482, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 232.6875, "completions/mean_terminated_length": 232.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3923417329788208, "epoch": 2.8284313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.8466391252933121, "kl": 0.029825102537870407, "learning_rate": 1.0486332583853564e-08, "loss": 0.0663, "num_tokens": 72799488.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.555580496788025, "sampling/importance_sampling_ratio/mean": 0.999732494354248, "sampling/importance_sampling_ratio/min": 0.7178300619125366, "sampling/sampling_logp_difference/max": 0.4418487548828125, "sampling/sampling_logp_difference/mean": 0.013146606273949146, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 164.828125, "completions/mean_terminated_length": 164.828125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3960435390472412, "epoch": 2.829656862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.02599589169385834, "kl": 0.03480679541826248, "learning_rate": 1.0341697202742971e-08, "loss": 0.0004, "num_tokens": 72826741.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4337533712387085, "sampling/importance_sampling_ratio/mean": 0.9996919631958008, "sampling/importance_sampling_ratio/min": 0.690007209777832, "sampling/sampling_logp_difference/max": 0.37105321884155273, "sampling/sampling_logp_difference/mean": 0.01564282365143299, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.37923213839530945, "epoch": 2.8308823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.018401309831405036, "kl": 0.020857226103544235, "learning_rate": 1.0198055796966253e-08, "loss": 0.0002, "num_tokens": 72866397.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5018757581710815, "sampling/importance_sampling_ratio/mean": 1.0000402927398682, "sampling/importance_sampling_ratio/min": 0.6099699139595032, "sampling/sampling_logp_difference/max": 0.49434566497802734, "sampling/sampling_logp_difference/mean": 0.014194940216839314, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 179.34375, "completions/mean_terminated_length": 179.34375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4174288511276245, "epoch": 2.832107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.019770007295590857, "kl": 0.024596508592367172, "learning_rate": 1.0055408658106446e-08, "loss": 0.0002, "num_tokens": 72895619.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5415550470352173, "sampling/importance_sampling_ratio/mean": 1.0004100799560547, "sampling/importance_sampling_ratio/min": 0.6368833780288696, "sampling/sampling_logp_difference/max": 0.4511687755584717, "sampling/sampling_logp_difference/mean": 0.01626231148838997, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 212.96875, "completions/mean_terminated_length": 212.96875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3599066138267517, "epoch": 2.8333333333333335, "frac_reward_zero_std": 1.0, "grad_norm": 0.04102920645319019, "kl": 0.030512923374772072, "learning_rate": 9.913756075728086e-09, "loss": 0.0003, "num_tokens": 72926705.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00050950050354, "sampling/importance_sampling_ratio/min": 0.3303156793117523, "sampling/sampling_logp_difference/max": 1.1077064275741577, "sampling/sampling_logp_difference/mean": 0.015415752306580544, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 214.46875, "completions/mean_terminated_length": 214.46875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.41638386249542236, "epoch": 2.8345588235294117, "frac_reward_zero_std": 0.25, "grad_norm": 1.317533006833584, "kl": 0.038041263818740845, "learning_rate": 9.77309833737705e-09, "loss": 0.0006, "num_tokens": 72959503.0, "reward": 0.0625, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4607181549072266, "sampling/importance_sampling_ratio/mean": 1.000251293182373, "sampling/importance_sampling_ratio/min": 0.6063253879547119, "sampling/sampling_logp_difference/max": 0.5003385543823242, "sampling/sampling_logp_difference/mean": 0.015667861327528954, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 224.671875, "completions/mean_terminated_length": 224.671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3578198552131653, "epoch": 2.8357843137254903, "frac_reward_zero_std": 1.0, "grad_norm": 0.01726464198667003, "kl": 0.024974340572953224, "learning_rate": 9.633435728579553e-09, "loss": 0.0002, "num_tokens": 72999402.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5950409173965454, "sampling/importance_sampling_ratio/mean": 0.9995691180229187, "sampling/importance_sampling_ratio/min": 0.5801035165786743, "sampling/sampling_logp_difference/max": 0.544548749923706, "sampling/sampling_logp_difference/mean": 0.014314240776002407, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 179.203125, "completions/mean_terminated_length": 179.203125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3301982879638672, "epoch": 2.8370098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.24249387395368324, "kl": 0.030474865809082985, "learning_rate": 9.494768532841868e-09, "loss": 0.0003, "num_tokens": 73026535.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4057986736297607, "sampling/importance_sampling_ratio/mean": 0.999641478061676, "sampling/importance_sampling_ratio/min": 0.01303944829851389, "sampling/sampling_logp_difference/max": 4.339776039123535, "sampling/sampling_logp_difference/mean": 0.014355506747961044, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 242.53125, "completions/mean_terminated_length": 242.53125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4580194354057312, "epoch": 2.838235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.022540072788519563, "kl": 0.0541636198759079, "learning_rate": 9.357097031649664e-09, "loss": 0.0005, "num_tokens": 73065625.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4006434679031372, "sampling/importance_sampling_ratio/mean": 1.000105619430542, "sampling/importance_sampling_ratio/min": 0.6771495342254639, "sampling/sampling_logp_difference/max": 0.38986313343048096, "sampling/sampling_logp_difference/mean": 0.01546635851264, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 231.734375, "completions/mean_terminated_length": 231.734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.4251118302345276, "epoch": 2.8394607843137254, "frac_reward_zero_std": 0.75, "grad_norm": 0.7661100233961052, "kl": 0.04458808898925781, "learning_rate": 9.22042150446728e-09, "loss": -0.0047, "num_tokens": 73100952.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.5140609741210938, "sampling/importance_sampling_ratio/mean": 1.0003342628479004, "sampling/importance_sampling_ratio/min": 0.6603867411613464, "sampling/sampling_logp_difference/max": 0.4149296283721924, "sampling/sampling_logp_difference/mean": 0.015295105054974556, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 173.84375, "completions/mean_terminated_length": 173.84375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.29564523696899414, "epoch": 2.840686274509804, "frac_reward_zero_std": 0.75, "grad_norm": 0.9336704230419721, "kl": 0.02050560712814331, "learning_rate": 9.084742228737564e-09, "loss": 0.0003, "num_tokens": 73128046.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4100669622421265, "sampling/importance_sampling_ratio/mean": 1.0000941753387451, "sampling/importance_sampling_ratio/min": 0.6374108195304871, "sampling/sampling_logp_difference/max": 0.45034098625183105, "sampling/sampling_logp_difference/mean": 0.012831066735088825, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 215.828125, "completions/mean_terminated_length": 215.828125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3264046311378479, "epoch": 2.8419117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.015784236326885567, "kl": 0.022641684859991074, "learning_rate": 8.95005947988059e-09, "loss": 0.0002, "num_tokens": 73162307.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3977603912353516, "sampling/importance_sampling_ratio/mean": 0.9998221397399902, "sampling/importance_sampling_ratio/min": 0.6920250058174133, "sampling/sampling_logp_difference/max": 0.36813318729400635, "sampling/sampling_logp_difference/mean": 0.012915970757603645, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 251.46875, "completions/mean_terminated_length": 251.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3470064401626587, "epoch": 2.843137254901961, "frac_reward_zero_std": 0.75, "grad_norm": 0.8172689514942891, "kl": 0.023917749524116516, "learning_rate": 8.816373531293941e-09, "loss": -0.03, "num_tokens": 73206097.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4835994243621826, "sampling/importance_sampling_ratio/mean": 1.0001312494277954, "sampling/importance_sampling_ratio/min": 0.4880845844745636, "sampling/sampling_logp_difference/max": 0.7172665596008301, "sampling/sampling_logp_difference/mean": 0.013287542387843132, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 207.703125, "completions/mean_terminated_length": 207.703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.42378494143486023, "epoch": 2.844362745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.035807953869066766, "kl": 0.060972969979047775, "learning_rate": 8.683684654351597e-09, "loss": 0.0005, "num_tokens": 73237694.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4630006551742554, "sampling/importance_sampling_ratio/mean": 1.000170111656189, "sampling/importance_sampling_ratio/min": 0.6115177869796753, "sampling/sampling_logp_difference/max": 0.49181127548217773, "sampling/sampling_logp_difference/mean": 0.01597459241747856, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 173.15625, "completions/mean_terminated_length": 173.15625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.32589611411094666, "epoch": 2.8455882352941178, "frac_reward_zero_std": 0.75, "grad_norm": 0.8919938548263948, "kl": 0.036309003829956055, "learning_rate": 8.551993118403656e-09, "loss": -0.0197, "num_tokens": 73271976.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.350554347038269, "sampling/importance_sampling_ratio/mean": 0.9998493790626526, "sampling/importance_sampling_ratio/min": 0.7448745965957642, "sampling/sampling_logp_difference/max": 0.30051517486572266, "sampling/sampling_logp_difference/mean": 0.012146501801908016, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 223.578125, "completions/mean_terminated_length": 223.578125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3561001420021057, "epoch": 2.846813725490196, "frac_reward_zero_std": 0.75, "grad_norm": 0.8875842059271485, "kl": 0.026764214038848877, "learning_rate": 8.4212991907755e-09, "loss": 0.0014, "num_tokens": 73305693.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.6464452743530273, "sampling/importance_sampling_ratio/mean": 1.000075101852417, "sampling/importance_sampling_ratio/min": 0.7136602997779846, "sampling/sampling_logp_difference/max": 0.49861860275268555, "sampling/sampling_logp_difference/mean": 0.013426331803202629, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.4191769063472748, "epoch": 2.8480392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 1.311661427454457, "kl": 0.048060342669487, "learning_rate": 8.291603136767521e-09, "loss": 0.0336, "num_tokens": 73333221.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4079670906066895, "sampling/importance_sampling_ratio/mean": 1.0000182390213013, "sampling/importance_sampling_ratio/min": 0.7247595191001892, "sampling/sampling_logp_difference/max": 0.3421468734741211, "sampling/sampling_logp_difference/mean": 0.015923064202070236, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30206298828125, "epoch": 2.849264705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.019728183790055673, "kl": 0.02351086027920246, "learning_rate": 8.16290521965457e-09, "loss": 0.0002, "num_tokens": 73357741.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5792306661605835, "sampling/importance_sampling_ratio/mean": 1.0001418590545654, "sampling/importance_sampling_ratio/min": 0.6430962681770325, "sampling/sampling_logp_difference/max": 0.4569377899169922, "sampling/sampling_logp_difference/mean": 0.013828947208821774, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 158.140625, "completions/mean_terminated_length": 158.140625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.3143717646598816, "epoch": 2.8504901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 1.1638472881677286, "kl": 0.0942501351237297, "learning_rate": 8.035205700685165e-09, "loss": 0.0258, "num_tokens": 73384550.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4685288667678833, "sampling/importance_sampling_ratio/mean": 0.9998546242713928, "sampling/importance_sampling_ratio/min": 0.6580463647842407, "sampling/sampling_logp_difference/max": 0.418479859828949, "sampling/sampling_logp_difference/mean": 0.014786609448492527, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3219773769378662, "epoch": 2.8517156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.02119687951493217, "kl": 0.03144240006804466, "learning_rate": 7.908504839081342e-09, "loss": 0.0003, "num_tokens": 73414718.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4369810819625854, "sampling/importance_sampling_ratio/mean": 0.9997972846031189, "sampling/importance_sampling_ratio/min": 0.6411973237991333, "sampling/sampling_logp_difference/max": 0.4444180727005005, "sampling/sampling_logp_difference/mean": 0.013356061652302742, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 173.53125, "completions/mean_terminated_length": 173.53125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3775599002838135, "epoch": 2.8529411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 1.0575776332387867, "kl": 0.03039197437465191, "learning_rate": 7.7828028920377e-09, "loss": 0.0227, "num_tokens": 73447600.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.290131688117981, "sampling/importance_sampling_ratio/mean": 0.9999251961708069, "sampling/importance_sampling_ratio/min": 0.4140039384365082, "sampling/sampling_logp_difference/max": 0.8818798065185547, "sampling/sampling_logp_difference/mean": 0.014906775206327438, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.34313100576400757, "epoch": 2.8541666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.01749915624120215, "kl": 0.025798015296459198, "learning_rate": 7.658100114721344e-09, "loss": 0.0003, "num_tokens": 73477944.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.546187162399292, "sampling/importance_sampling_ratio/mean": 1.0003581047058105, "sampling/importance_sampling_ratio/min": 0.6673808693885803, "sampling/sampling_logp_difference/max": 0.4357919692993164, "sampling/sampling_logp_difference/mean": 0.013357100076973438, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 260.546875, "completions/mean_terminated_length": 260.546875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4380151033401489, "epoch": 2.855392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7354155385230504, "kl": 0.029750129207968712, "learning_rate": 7.534396760270956e-09, "loss": -0.0146, "num_tokens": 73516267.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.4922901391983032, "sampling/importance_sampling_ratio/mean": 1.000532627105713, "sampling/importance_sampling_ratio/min": 0.670768678188324, "sampling/sampling_logp_difference/max": 0.4003119468688965, "sampling/sampling_logp_difference/mean": 0.015685245394706726, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 227.65625, "completions/mean_terminated_length": 227.65625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.36915498971939087, "epoch": 2.8566176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.013620967683211502, "kl": 0.02155674248933792, "learning_rate": 7.411693079796499e-09, "loss": 0.0002, "num_tokens": 73548373.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5426087379455566, "sampling/importance_sampling_ratio/mean": 0.9998866319656372, "sampling/importance_sampling_ratio/min": 0.6737273335456848, "sampling/sampling_logp_difference/max": 0.4334750175476074, "sampling/sampling_logp_difference/mean": 0.013658540323376656, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 175.59375, "completions/mean_terminated_length": 175.59375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.43928152322769165, "epoch": 2.857843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.03764398945734357, "kl": 0.0492483451962471, "learning_rate": 7.289989322378731e-09, "loss": 0.0005, "num_tokens": 73577563.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4624541997909546, "sampling/importance_sampling_ratio/mean": 0.9998334646224976, "sampling/importance_sampling_ratio/min": 0.654796838760376, "sampling/sampling_logp_difference/max": 0.4234302043914795, "sampling/sampling_logp_difference/mean": 0.016958530992269516, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 221.46875, "completions/mean_terminated_length": 221.46875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.3196999430656433, "epoch": 2.8590686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.046409294879126074, "kl": 0.030097268521785736, "learning_rate": 7.169285735068531e-09, "loss": 0.0003, "num_tokens": 73611641.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4039427042007446, "sampling/importance_sampling_ratio/mean": 1.0001153945922852, "sampling/importance_sampling_ratio/min": 0.6454610824584961, "sampling/sampling_logp_difference/max": 0.4377903938293457, "sampling/sampling_logp_difference/mean": 0.01287408173084259, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 176.234375, "completions/mean_terminated_length": 176.234375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.42879849672317505, "epoch": 2.860294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.03494612726148045, "kl": 0.04784668609499931, "learning_rate": 7.049582562886513e-09, "loss": 0.0005, "num_tokens": 73636472.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3896585702896118, "sampling/importance_sampling_ratio/mean": 0.9997230768203735, "sampling/importance_sampling_ratio/min": 0.6368883848190308, "sampling/sampling_logp_difference/max": 0.45116090774536133, "sampling/sampling_logp_difference/mean": 0.01739996112883091, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 223.828125, "completions/mean_terminated_length": 223.828125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.40986114740371704, "epoch": 2.861519607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.015932745230181365, "kl": 0.029207296669483185, "learning_rate": 6.930880048822529e-09, "loss": 0.0003, "num_tokens": 73667837.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5521903038024902, "sampling/importance_sampling_ratio/mean": 1.0004559755325317, "sampling/importance_sampling_ratio/min": 0.6305375099182129, "sampling/sampling_logp_difference/max": 0.4611825942993164, "sampling/sampling_logp_difference/mean": 0.014823229983448982, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 169.953125, "completions/mean_terminated_length": 169.953125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.44827860593795776, "epoch": 2.8627450980392157, "frac_reward_zero_std": 0.5, "grad_norm": 1.538045583783523, "kl": 0.07150943577289581, "learning_rate": 6.813178433835221e-09, "loss": -0.0011, "num_tokens": 73690730.0, "reward": 0.34375, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.395931363105774, "sampling/importance_sampling_ratio/mean": 0.999872624874115, "sampling/importance_sampling_ratio/min": 0.6334940195083618, "sampling/sampling_logp_difference/max": 0.45650482177734375, "sampling/sampling_logp_difference/mean": 0.01682509109377861, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 177.953125, "completions/mean_terminated_length": 177.953125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.38354188203811646, "epoch": 2.8639705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.027594890610444674, "kl": 0.022542208433151245, "learning_rate": 6.696477956851354e-09, "loss": 0.0002, "num_tokens": 73722903.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6809242963790894, "sampling/importance_sampling_ratio/mean": 1.0005621910095215, "sampling/importance_sampling_ratio/min": 0.7015100717544556, "sampling/sampling_logp_difference/max": 0.5193438529968262, "sampling/sampling_logp_difference/mean": 0.01487412117421627, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3849712014198303, "epoch": 2.8651960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.3275793791177526, "kl": 0.05220656096935272, "learning_rate": 6.580778854765489e-09, "loss": 0.0074, "num_tokens": 73754547.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4683117866516113, "sampling/importance_sampling_ratio/mean": 0.9991458654403687, "sampling/importance_sampling_ratio/min": 0.6368657350540161, "sampling/sampling_logp_difference/max": 0.45119643211364746, "sampling/sampling_logp_difference/mean": 0.015320195816457272, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.42644044756889343, "epoch": 2.866421568627451, "frac_reward_zero_std": 0.5, "grad_norm": 1.0108201720755798, "kl": 0.026010312139987946, "learning_rate": 6.4660813624395905e-09, "loss": 0.1167, "num_tokens": 73793995.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3339488506317139, "sampling/importance_sampling_ratio/mean": 1.0000989437103271, "sampling/importance_sampling_ratio/min": 0.5947801470756531, "sampling/sampling_logp_difference/max": 0.5195634365081787, "sampling/sampling_logp_difference/mean": 0.014183470979332924, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3695865273475647, "epoch": 2.8676470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0166889012835278, "kl": 0.025707734748721123, "learning_rate": 6.3523857127021905e-09, "loss": 0.0002, "num_tokens": 73824715.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3675557374954224, "sampling/importance_sampling_ratio/mean": 1.0001277923583984, "sampling/importance_sampling_ratio/min": 0.6827847361564636, "sampling/sampling_logp_difference/max": 0.3815755844116211, "sampling/sampling_logp_difference/mean": 0.014930440112948418, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 188.46875, "completions/mean_terminated_length": 188.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.38538146018981934, "epoch": 2.868872549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.018286261951037185, "kl": 0.028369126841425896, "learning_rate": 6.239692136348284e-09, "loss": 0.0003, "num_tokens": 73857337.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.411484718322754, "sampling/importance_sampling_ratio/mean": 1.0004916191101074, "sampling/importance_sampling_ratio/min": 0.6207948327064514, "sampling/sampling_logp_difference/max": 0.47675466537475586, "sampling/sampling_logp_difference/mean": 0.014215872623026371, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 205.453125, "completions/mean_terminated_length": 205.453125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3979039788246155, "epoch": 2.8700980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.017187819769637448, "kl": 0.026213616132736206, "learning_rate": 6.12800086213866e-09, "loss": 0.0003, "num_tokens": 73889846.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4555035829544067, "sampling/importance_sampling_ratio/mean": 1.0002341270446777, "sampling/importance_sampling_ratio/min": 0.690625786781311, "sampling/sampling_logp_difference/max": 0.3753519058227539, "sampling/sampling_logp_difference/mean": 0.014101998880505562, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 209.453125, "completions/mean_terminated_length": 209.453125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4886457622051239, "epoch": 2.8713235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.031221469579826702, "kl": 0.049845144152641296, "learning_rate": 6.017312116799566e-09, "loss": 0.0006, "num_tokens": 73920291.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5176514387130737, "sampling/importance_sampling_ratio/mean": 0.9998294115066528, "sampling/importance_sampling_ratio/min": 0.6151835918426514, "sampling/sampling_logp_difference/max": 0.48583459854125977, "sampling/sampling_logp_difference/mean": 0.016400672495365143, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 135.6875, "completions/mean_terminated_length": 135.6875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3104097247123718, "epoch": 2.872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.02437347429895532, "kl": 0.029835911467671394, "learning_rate": 5.907626125022158e-09, "loss": 0.0003, "num_tokens": 73945487.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007180213928223, "sampling/importance_sampling_ratio/mean": 0.9998512864112854, "sampling/importance_sampling_ratio/min": 0.6778629422187805, "sampling/sampling_logp_difference/max": 0.47045230865478516, "sampling/sampling_logp_difference/mean": 0.014408271759748459, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 160.890625, "completions/mean_terminated_length": 160.890625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3479102849960327, "epoch": 2.873774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.022136621297683786, "kl": 0.025515876710414886, "learning_rate": 5.798943109461995e-09, "loss": 0.0002, "num_tokens": 73972104.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6368871927261353, "sampling/importance_sampling_ratio/mean": 0.9992555975914001, "sampling/importance_sampling_ratio/min": 0.6381741166114807, "sampling/sampling_logp_difference/max": 0.4927964210510254, "sampling/sampling_logp_difference/mean": 0.014105742797255516, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 198.609375, "completions/mean_terminated_length": 198.609375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3937147259712219, "epoch": 2.875, "frac_reward_zero_std": 0.75, "grad_norm": 0.8726929204105099, "kl": 0.0347832590341568, "learning_rate": 5.691263290738824e-09, "loss": 0.009, "num_tokens": 74004415.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5203908681869507, "sampling/importance_sampling_ratio/mean": 1.0003485679626465, "sampling/importance_sampling_ratio/min": 0.4887605905532837, "sampling/sampling_logp_difference/max": 0.7158825397491455, "sampling/sampling_logp_difference/mean": 0.014815937727689743, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 209.984375, "completions/mean_terminated_length": 209.984375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.31797924637794495, "epoch": 2.876225490196078, "frac_reward_zero_std": 1.0, "grad_norm": 0.01062699324841459, "kl": 0.014805572107434273, "learning_rate": 5.5845868874357385e-09, "loss": 0.0001, "num_tokens": 74039246.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5606545209884644, "sampling/importance_sampling_ratio/mean": 0.9998297095298767, "sampling/importance_sampling_ratio/min": 0.6799756288528442, "sampling/sampling_logp_difference/max": 0.44510531425476074, "sampling/sampling_logp_difference/mean": 0.01304752379655838, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 228.359375, "completions/mean_terminated_length": 228.359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4947429597377777, "epoch": 2.877450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.016340773511117686, "kl": 0.02197902649641037, "learning_rate": 5.4789141160991314e-09, "loss": 0.0002, "num_tokens": 74077061.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5613102912902832, "sampling/importance_sampling_ratio/mean": 1.0001163482666016, "sampling/importance_sampling_ratio/min": 0.6342008113861084, "sampling/sampling_logp_difference/max": 0.45538973808288574, "sampling/sampling_logp_difference/mean": 0.017344612628221512, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 175.84375, "completions/mean_terminated_length": 175.84375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.39359569549560547, "epoch": 2.8786764705882355, "frac_reward_zero_std": 0.75, "grad_norm": 0.9849048087035472, "kl": 0.047245509922504425, "learning_rate": 5.374245191238025e-09, "loss": -0.0041, "num_tokens": 74103291.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4255034923553467, "sampling/importance_sampling_ratio/mean": 0.9997348785400391, "sampling/importance_sampling_ratio/min": 0.6171954870223999, "sampling/sampling_logp_difference/max": 0.48256945610046387, "sampling/sampling_logp_difference/mean": 0.015792738646268845, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 189.703125, "completions/mean_terminated_length": 189.703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3153325021266937, "epoch": 2.8799019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.6795044307555955, "kl": 0.02400416135787964, "learning_rate": 5.270580325323681e-09, "loss": 0.0213, "num_tokens": 74134984.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5746557712554932, "sampling/importance_sampling_ratio/mean": 0.9998120069503784, "sampling/importance_sampling_ratio/min": 0.6600251793861389, "sampling/sampling_logp_difference/max": 0.4540367126464844, "sampling/sampling_logp_difference/mean": 0.011926532723009586, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.29931190609931946, "epoch": 2.881127450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.013776997919416478, "kl": 0.01953669637441635, "learning_rate": 5.167919728789271e-09, "loss": 0.0002, "num_tokens": 74164792.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4057984352111816, "sampling/importance_sampling_ratio/mean": 1.0004501342773438, "sampling/importance_sampling_ratio/min": 0.6177850961685181, "sampling/sampling_logp_difference/max": 0.4816145896911621, "sampling/sampling_logp_difference/mean": 0.013187164440751076, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3355157971382141, "epoch": 2.8823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.013278609391465654, "kl": 0.019463874399662018, "learning_rate": 5.0662636100292086e-09, "loss": 0.0002, "num_tokens": 74192504.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.466301679611206, "sampling/importance_sampling_ratio/mean": 1.0001624822616577, "sampling/importance_sampling_ratio/min": 0.658620297908783, "sampling/sampling_logp_difference/max": 0.4176081418991089, "sampling/sampling_logp_difference/mean": 0.014016900211572647, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3749135434627533, "epoch": 2.883578431372549, "frac_reward_zero_std": 1.0, "grad_norm": 0.015083554479084932, "kl": 0.023390740156173706, "learning_rate": 4.965612175399092e-09, "loss": 0.0002, "num_tokens": 74232696.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.617496132850647, "sampling/importance_sampling_ratio/mean": 0.99971604347229, "sampling/importance_sampling_ratio/min": 0.6912395358085632, "sampling/sampling_logp_difference/max": 0.4808793067932129, "sampling/sampling_logp_difference/mean": 0.013231323100626469, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 217.40625, "completions/mean_terminated_length": 217.40625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.375158429145813, "epoch": 2.8848039215686274, "frac_reward_zero_std": 0.75, "grad_norm": 0.7579212009089664, "kl": 0.034684307873249054, "learning_rate": 4.865965629214819e-09, "loss": 0.0146, "num_tokens": 74265970.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6056530475616455, "sampling/importance_sampling_ratio/mean": 0.9999037384986877, "sampling/importance_sampling_ratio/min": 0.6142159700393677, "sampling/sampling_logp_difference/max": 0.4874086380004883, "sampling/sampling_logp_difference/mean": 0.014152498915791512, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 212.359375, "completions/mean_terminated_length": 212.359375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3427656888961792, "epoch": 2.8860294117647056, "frac_reward_zero_std": 0.75, "grad_norm": 0.7357714851817373, "kl": 0.031769394874572754, "learning_rate": 4.767324173752696e-09, "loss": -0.0264, "num_tokens": 74294793.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4352306127548218, "sampling/importance_sampling_ratio/mean": 1.0003530979156494, "sampling/importance_sampling_ratio/min": 0.6146575212478638, "sampling/sampling_logp_difference/max": 0.48669004440307617, "sampling/sampling_logp_difference/mean": 0.01346497144550085, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 200.53125, "completions/mean_terminated_length": 200.53125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3489496111869812, "epoch": 2.8872549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.017132599119672152, "kl": 0.023278038948774338, "learning_rate": 4.669688009248607e-09, "loss": 0.0002, "num_tokens": 74327499.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467315912246704, "sampling/importance_sampling_ratio/mean": 1.0001590251922607, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.013841088861227036, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 208.296875, "completions/mean_terminated_length": 208.296875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3163909912109375, "epoch": 2.888480392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.014622783543643882, "kl": 0.02592034824192524, "learning_rate": 4.5730573338976786e-09, "loss": 0.0002, "num_tokens": 74355950.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5763987302780151, "sampling/importance_sampling_ratio/mean": 0.9997361302375793, "sampling/importance_sampling_ratio/min": 0.6836771368980408, "sampling/sampling_logp_difference/max": 0.4551429748535156, "sampling/sampling_logp_difference/mean": 0.01313449814915657, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 249.96875, "completions/mean_terminated_length": 249.96875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.323248028755188, "epoch": 2.889705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.03207862100978474, "kl": 0.036106012761592865, "learning_rate": 4.477432343854226e-09, "loss": 0.0004, "num_tokens": 74394956.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6612498760223389, "sampling/importance_sampling_ratio/mean": 1.0000250339508057, "sampling/importance_sampling_ratio/min": 0.4179401099681854, "sampling/sampling_logp_difference/max": 0.8724172115325928, "sampling/sampling_logp_difference/mean": 0.01283775083720684, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 217.734375, "completions/mean_terminated_length": 217.734375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.273295134305954, "epoch": 2.8909313725490198, "frac_reward_zero_std": 0.75, "grad_norm": 0.8761155246874087, "kl": 0.020199408754706383, "learning_rate": 4.382813233230698e-09, "loss": -0.0175, "num_tokens": 74425515.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6355791091918945, "sampling/importance_sampling_ratio/mean": 0.9998001456260681, "sampling/importance_sampling_ratio/min": 0.6581597924232483, "sampling/sampling_logp_difference/max": 0.49199700355529785, "sampling/sampling_logp_difference/mean": 0.011664999648928642, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 186.765625, "completions/mean_terminated_length": 186.765625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3464294672012329, "epoch": 2.892156862745098, "frac_reward_zero_std": 1.0, "grad_norm": 0.017237330228140245, "kl": 0.02178099937736988, "learning_rate": 4.289200194098119e-09, "loss": 0.0002, "num_tokens": 74457436.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5837095975875854, "sampling/importance_sampling_ratio/mean": 0.9997559189796448, "sampling/importance_sampling_ratio/min": 0.5434461832046509, "sampling/sampling_logp_difference/max": 0.6098246574401855, "sampling/sampling_logp_difference/mean": 0.013934292830526829, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.4872405230998993, "epoch": 2.8933823529411766, "frac_reward_zero_std": 0.75, "grad_norm": 0.7353622489765991, "kl": 0.09670692682266235, "learning_rate": 4.196593416484873e-09, "loss": 0.0149, "num_tokens": 74488004.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.7672679424285889, "sampling/importance_sampling_ratio/mean": 1.0000057220458984, "sampling/importance_sampling_ratio/min": 0.6771938800811768, "sampling/sampling_logp_difference/max": 0.5694348812103271, "sampling/sampling_logp_difference/mean": 0.016465749591588974, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 192.03125, "completions/mean_terminated_length": 192.03125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3528916835784912, "epoch": 2.894607843137255, "frac_reward_zero_std": 0.75, "grad_norm": 0.8473547012911232, "kl": 0.044261232018470764, "learning_rate": 4.104993088376974e-09, "loss": -0.0011, "num_tokens": 74515478.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.5398973226547241, "sampling/importance_sampling_ratio/mean": 1.0009403228759766, "sampling/importance_sampling_ratio/min": 0.637012243270874, "sampling/sampling_logp_difference/max": 0.45096635818481445, "sampling/sampling_logp_difference/mean": 0.014576076529920101, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 211.6875, "completions/mean_terminated_length": 211.6875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.47120723128318787, "epoch": 2.8958333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.9557901675410637, "kl": 0.058704107999801636, "learning_rate": 4.0143993957171826e-09, "loss": -0.0236, "num_tokens": 74553778.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4382609128952026, "sampling/importance_sampling_ratio/mean": 1.0003687143325806, "sampling/importance_sampling_ratio/min": 0.2649329602718353, "sampling/sampling_logp_difference/max": 1.3282785415649414, "sampling/sampling_logp_difference/mean": 0.01628001034259796, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 210.3125, "completions/mean_terminated_length": 210.3125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3334640860557556, "epoch": 2.8970588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.01333760190161096, "kl": 0.017741121351718903, "learning_rate": 3.924812522404952e-09, "loss": 0.0002, "num_tokens": 74589862.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007710695266724, "sampling/importance_sampling_ratio/mean": 1.0000438690185547, "sampling/importance_sampling_ratio/min": 0.6262631416320801, "sampling/sampling_logp_difference/max": 0.4704854488372803, "sampling/sampling_logp_difference/mean": 0.013768445700407028, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 231.015625, "completions/mean_terminated_length": 231.015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.31530332565307617, "epoch": 2.8982843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.7073960304740204, "kl": 0.027371685951948166, "learning_rate": 3.836232650296034e-09, "loss": -0.0103, "num_tokens": 74623879.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.2983132600784302, "sampling/importance_sampling_ratio/mean": 0.9995091557502747, "sampling/importance_sampling_ratio/min": 0.648907482624054, "sampling/sampling_logp_difference/max": 0.4324650764465332, "sampling/sampling_logp_difference/mean": 0.011638626456260681, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 199.5625, "completions/mean_terminated_length": 199.5625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.42891377210617065, "epoch": 2.8995098039215685, "frac_reward_zero_std": 0.75, "grad_norm": 0.9962438696300998, "kl": 0.05536392331123352, "learning_rate": 3.748659959201928e-09, "loss": 0.0191, "num_tokens": 74653419.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5813865661621094, "sampling/importance_sampling_ratio/mean": 1.0003461837768555, "sampling/importance_sampling_ratio/min": 0.6171907782554626, "sampling/sampling_logp_difference/max": 0.4825770854949951, "sampling/sampling_logp_difference/mean": 0.015942316502332687, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 172.546875, "completions/mean_terminated_length": 172.546875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3402683138847351, "epoch": 2.900735294117647, "frac_reward_zero_std": 0.75, "grad_norm": 1.1070267439657486, "kl": 0.03943384811282158, "learning_rate": 3.6620946268896556e-09, "loss": 0.0254, "num_tokens": 74678990.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9991967678070068, "sampling/importance_sampling_ratio/min": 0.6189759373664856, "sampling/sampling_logp_difference/max": 0.709916353225708, "sampling/sampling_logp_difference/mean": 0.015335087664425373, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 212.890625, "completions/mean_terminated_length": 212.890625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.28859928250312805, "epoch": 2.9019607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.011678689245716639, "kl": 0.01910664513707161, "learning_rate": 3.5765368290813223e-09, "loss": 0.0002, "num_tokens": 74711639.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3941144943237305, "sampling/importance_sampling_ratio/mean": 0.9995889663696289, "sampling/importance_sampling_ratio/min": 0.6953766942024231, "sampling/sampling_logp_difference/max": 0.36330151557922363, "sampling/sampling_logp_difference/mean": 0.01146257109940052, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 186.421875, "completions/mean_terminated_length": 186.421875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.38304442167282104, "epoch": 2.903186274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.016711314854557412, "kl": 0.02678157389163971, "learning_rate": 3.491986739453889e-09, "loss": 0.0003, "num_tokens": 74743730.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6008230447769165, "sampling/importance_sampling_ratio/mean": 0.9997867345809937, "sampling/importance_sampling_ratio/min": 0.607703685760498, "sampling/sampling_logp_difference/max": 0.49806785583496094, "sampling/sampling_logp_difference/mean": 0.015621802769601345, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 144.78125, "completions/mean_terminated_length": 144.78125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.30216503143310547, "epoch": 2.9044117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.01780466805583866, "kl": 0.025277363136410713, "learning_rate": 3.4084445296386767e-09, "loss": 0.0002, "num_tokens": 74773316.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4464720487594604, "sampling/importance_sampling_ratio/mean": 1.0005719661712646, "sampling/importance_sampling_ratio/min": 0.6782976388931274, "sampling/sampling_logp_difference/max": 0.3881690502166748, "sampling/sampling_logp_difference/mean": 0.01319819875061512, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 197.171875, "completions/mean_terminated_length": 197.171875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3905153274536133, "epoch": 2.905637254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.019401886283428777, "kl": 0.027070432901382446, "learning_rate": 3.3259103692209745e-09, "loss": 0.0003, "num_tokens": 74803295.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5972893238067627, "sampling/importance_sampling_ratio/mean": 0.9990304112434387, "sampling/importance_sampling_ratio/min": 0.5910945534706116, "sampling/sampling_logp_difference/max": 0.5257792472839355, "sampling/sampling_logp_difference/mean": 0.016149362549185753, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 197.3125, "completions/mean_terminated_length": 197.3125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3718438148498535, "epoch": 2.906862745098039, "frac_reward_zero_std": 1.0, "grad_norm": 0.023195987676754584, "kl": 0.031065698713064194, "learning_rate": 3.2443844257400434e-09, "loss": 0.0003, "num_tokens": 74839683.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5779024362564087, "sampling/importance_sampling_ratio/mean": 1.0002520084381104, "sampling/importance_sampling_ratio/min": 0.6434327363967896, "sampling/sampling_logp_difference/max": 0.4560964107513428, "sampling/sampling_logp_difference/mean": 0.015002873726189137, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.35181066393852234, "epoch": 2.9080882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.1243511056485311, "kl": 0.0297673549503088, "learning_rate": 3.163866864688336e-09, "loss": -0.0024, "num_tokens": 74872547.0, "reward": 0.03125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5072041749954224, "sampling/importance_sampling_ratio/mean": 1.0001254081726074, "sampling/importance_sampling_ratio/min": 0.5031969547271729, "sampling/sampling_logp_difference/max": 0.6867736577987671, "sampling/sampling_logp_difference/mean": 0.013822752982378006, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 238.78125, "completions/mean_terminated_length": 238.78125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3895314335823059, "epoch": 2.909313725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.014679421412962962, "kl": 0.018656454980373383, "learning_rate": 3.0843578495113877e-09, "loss": 0.0002, "num_tokens": 74906645.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4728115797042847, "sampling/importance_sampling_ratio/mean": 0.9996693134307861, "sampling/importance_sampling_ratio/min": 0.5432451963424683, "sampling/sampling_logp_difference/max": 0.6101944446563721, "sampling/sampling_logp_difference/mean": 0.014664944261312485, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 176.171875, "completions/mean_terminated_length": 176.171875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3867795765399933, "epoch": 2.9105392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.8642869450296733, "kl": 0.10119754076004028, "learning_rate": 3.0058575416073707e-09, "loss": -0.0125, "num_tokens": 74934032.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6431214809417725, "sampling/importance_sampling_ratio/mean": 0.9997879266738892, "sampling/importance_sampling_ratio/min": 0.6962983012199402, "sampling/sampling_logp_difference/max": 0.4965977668762207, "sampling/sampling_logp_difference/mean": 0.015408418141305447, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 163.5625, "completions/mean_terminated_length": 163.5625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3355710208415985, "epoch": 2.911764705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.9124505186964353, "kl": 0.02697381004691124, "learning_rate": 2.9283661003270952e-09, "loss": 0.0412, "num_tokens": 74962628.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5278233289718628, "sampling/importance_sampling_ratio/mean": 1.0000895261764526, "sampling/importance_sampling_ratio/min": 0.6484197974205017, "sampling/sampling_logp_difference/max": 0.43321692943573, "sampling/sampling_logp_difference/mean": 0.014956897124648094, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 199.328125, "completions/mean_terminated_length": 199.328125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.38779014348983765, "epoch": 2.9129901960784315, "frac_reward_zero_std": 0.75, "grad_norm": 0.7607617709264936, "kl": 0.036527037620544434, "learning_rate": 2.851883682973233e-09, "loss": 0.0184, "num_tokens": 74996057.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.2827922105789185, "sampling/importance_sampling_ratio/mean": 0.9993733763694763, "sampling/importance_sampling_ratio/min": 0.6301738619804382, "sampling/sampling_logp_difference/max": 0.4617595672607422, "sampling/sampling_logp_difference/mean": 0.013738825917243958, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 226.3125, "completions/mean_terminated_length": 226.3125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.28482919931411743, "epoch": 2.9142156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.01537201761713231, "kl": 0.024184320122003555, "learning_rate": 2.776410444800148e-09, "loss": 0.0003, "num_tokens": 75030781.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4735807180404663, "sampling/importance_sampling_ratio/mean": 0.9999432563781738, "sampling/importance_sampling_ratio/min": 0.6046412587165833, "sampling/sampling_logp_difference/max": 0.503119945526123, "sampling/sampling_logp_difference/mean": 0.01173341367393732, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 184.28125, "completions/mean_terminated_length": 184.28125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3381809890270233, "epoch": 2.9154411764705883, "frac_reward_zero_std": 1.0, "grad_norm": 0.014636593004207615, "kl": 0.021457120776176453, "learning_rate": 2.701946539013844e-09, "loss": 0.0002, "num_tokens": 75060031.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.530226230621338, "sampling/importance_sampling_ratio/mean": 0.9997565746307373, "sampling/importance_sampling_ratio/min": 0.675378680229187, "sampling/sampling_logp_difference/max": 0.4254155158996582, "sampling/sampling_logp_difference/mean": 0.014186479151248932, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 183.390625, "completions/mean_terminated_length": 183.390625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.36331191658973694, "epoch": 2.9166666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.012508191222585525, "kl": 0.020261242985725403, "learning_rate": 2.628492116771297e-09, "loss": 0.0002, "num_tokens": 75090312.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6600890159606934, "sampling/importance_sampling_ratio/mean": 1.0003174543380737, "sampling/importance_sampling_ratio/min": 0.6446958184242249, "sampling/sampling_logp_difference/max": 0.506871223449707, "sampling/sampling_logp_difference/mean": 0.01493038795888424, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 212.640625, "completions/mean_terminated_length": 212.640625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.5538235902786255, "epoch": 2.917892156862745, "frac_reward_zero_std": 0.5, "grad_norm": 1.448151904184562, "kl": 0.057340092957019806, "learning_rate": 2.556047327180344e-09, "loss": -0.0416, "num_tokens": 75119937.0, "reward": 0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4229785203933716, "sampling/importance_sampling_ratio/mean": 1.0000115633010864, "sampling/importance_sampling_ratio/min": 0.6484609842300415, "sampling/sampling_logp_difference/max": 0.4331533908843994, "sampling/sampling_logp_difference/mean": 0.019155602902173996, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 182.578125, "completions/mean_terminated_length": 182.578125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.48329854011535645, "epoch": 2.9191176470588234, "frac_reward_zero_std": 0.75, "grad_norm": 0.8522176133793282, "kl": 0.0585813894867897, "learning_rate": 2.484612317299295e-09, "loss": 0.0062, "num_tokens": 75149494.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000645399093628, "sampling/importance_sampling_ratio/min": 0.6106173992156982, "sampling/sampling_logp_difference/max": 0.8198103904724121, "sampling/sampling_logp_difference/mean": 0.01875029131770134, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 208.8125, "completions/mean_terminated_length": 208.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3520786762237549, "epoch": 2.920343137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.08546723648294711, "kl": 0.06635012477636337, "learning_rate": 2.4141872321367107e-09, "loss": 0.0006, "num_tokens": 75178906.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4360222816467285, "sampling/importance_sampling_ratio/mean": 1.0002968311309814, "sampling/importance_sampling_ratio/min": 0.70079505443573, "sampling/sampling_logp_difference/max": 0.3618769645690918, "sampling/sampling_logp_difference/mean": 0.013945094309747219, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3963060975074768, "epoch": 2.9215686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.015982183017377614, "kl": 0.02367197349667549, "learning_rate": 2.344772214651014e-09, "loss": 0.0002, "num_tokens": 75214250.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6598310470581055, "sampling/importance_sampling_ratio/mean": 0.9992620944976807, "sampling/importance_sampling_ratio/min": 0.6274728178977966, "sampling/sampling_logp_difference/max": 0.5067157745361328, "sampling/sampling_logp_difference/mean": 0.016099225729703903, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 207.4375, "completions/mean_terminated_length": 207.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2980409264564514, "epoch": 2.922794117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.021820451302088827, "kl": 0.020066574215888977, "learning_rate": 2.2763674057503235e-09, "loss": 0.0002, "num_tokens": 75251766.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4847639799118042, "sampling/importance_sampling_ratio/mean": 1.0004727840423584, "sampling/importance_sampling_ratio/min": 0.7234383225440979, "sampling/sampling_logp_difference/max": 0.39525580406188965, "sampling/sampling_logp_difference/mean": 0.012051548808813095, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 202.546875, "completions/mean_terminated_length": 202.546875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.3355339765548706, "epoch": 2.924019607843137, "frac_reward_zero_std": 0.75, "grad_norm": 0.8187199074871825, "kl": 0.03872930258512497, "learning_rate": 2.20897294429212e-09, "loss": -0.0108, "num_tokens": 75282153.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5993927717208862, "sampling/importance_sampling_ratio/mean": 0.9999510049819946, "sampling/importance_sampling_ratio/min": 0.6995729207992554, "sampling/sampling_logp_difference/max": 0.46962404251098633, "sampling/sampling_logp_difference/mean": 0.012686062604188919, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 231.59375, "completions/mean_terminated_length": 231.59375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3886439800262451, "epoch": 2.9252450980392157, "frac_reward_zero_std": 1.0, "grad_norm": 0.013960937498158493, "kl": 0.02158696949481964, "learning_rate": 2.142588967082748e-09, "loss": 0.0002, "num_tokens": 75316847.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.774400234222412, "sampling/importance_sampling_ratio/mean": 1.0002013444900513, "sampling/importance_sampling_ratio/min": 0.6808310747146606, "sampling/sampling_logp_difference/max": 0.5734624862670898, "sampling/sampling_logp_difference/mean": 0.014919614419341087, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3687131404876709, "epoch": 2.9264705882352944, "frac_reward_zero_std": 1.0, "grad_norm": 0.025827776395270535, "kl": 0.032721925526857376, "learning_rate": 2.0772156088776913e-09, "loss": 0.0003, "num_tokens": 75346155.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4881680011749268, "sampling/importance_sampling_ratio/mean": 1.0002375841140747, "sampling/importance_sampling_ratio/min": 0.6254510879516602, "sampling/sampling_logp_difference/max": 0.4692821502685547, "sampling/sampling_logp_difference/mean": 0.014783652499318123, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 203.765625, "completions/mean_terminated_length": 203.765625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4356057047843933, "epoch": 2.9276960784313726, "frac_reward_zero_std": 0.5, "grad_norm": 1.1822634605331237, "kl": 0.04897632822394371, "learning_rate": 2.0128530023804656e-09, "loss": 0.0146, "num_tokens": 75378860.0, "reward": 0.75, "reward_std": 0.3811737596988678, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4639828205108643, "sampling/importance_sampling_ratio/mean": 0.9999449849128723, "sampling/importance_sampling_ratio/min": 0.6783303618431091, "sampling/sampling_logp_difference/max": 0.3881208896636963, "sampling/sampling_logp_difference/mean": 0.015538797713816166, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 225.828125, "completions/mean_terminated_length": 225.828125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3999805748462677, "epoch": 2.928921568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.017521391739172297, "kl": 0.024150025099515915, "learning_rate": 1.9495012782433375e-09, "loss": 0.0002, "num_tokens": 75415921.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4013463258743286, "sampling/importance_sampling_ratio/mean": 0.9993059635162354, "sampling/importance_sampling_ratio/min": 0.7225843071937561, "sampling/sampling_logp_difference/max": 0.33743345737457275, "sampling/sampling_logp_difference/mean": 0.014675119891762733, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 189.90625, "completions/mean_terminated_length": 189.90625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.32983535528182983, "epoch": 2.9301470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014103971751140148, "kl": 0.023350298404693604, "learning_rate": 1.887160565066048e-09, "loss": 0.0002, "num_tokens": 75445355.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6090011596679688, "sampling/importance_sampling_ratio/mean": 0.9996578693389893, "sampling/importance_sampling_ratio/min": 0.6511370539665222, "sampling/sampling_logp_difference/max": 0.4756135940551758, "sampling/sampling_logp_difference/mean": 0.014014555141329765, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.2617335915565491, "epoch": 2.931372549019608, "frac_reward_zero_std": 1.0, "grad_norm": 0.01586684245297235, "kl": 0.02428416907787323, "learning_rate": 1.8258309893965374e-09, "loss": 0.0003, "num_tokens": 75482235.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4298864603042603, "sampling/importance_sampling_ratio/mean": 1.0006804466247559, "sampling/importance_sampling_ratio/min": 0.7104549407958984, "sampling/sampling_logp_difference/max": 0.35759496688842773, "sampling/sampling_logp_difference/mean": 0.010682035237550735, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 170.484375, "completions/mean_terminated_length": 170.484375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3657550513744354, "epoch": 2.9325980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02114455024497704, "kl": 0.02423708140850067, "learning_rate": 1.7655126757297744e-09, "loss": 0.0002, "num_tokens": 75511706.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.403576135635376, "sampling/importance_sampling_ratio/mean": 0.9998015761375427, "sampling/importance_sampling_ratio/min": 0.5362949371337891, "sampling/sampling_logp_difference/max": 0.6230709552764893, "sampling/sampling_logp_difference/mean": 0.014422647655010223, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 196.265625, "completions/mean_terminated_length": 196.265625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.3682851493358612, "epoch": 2.9338235294117645, "frac_reward_zero_std": 1.0, "grad_norm": 0.020510558490007304, "kl": 0.037223272025585175, "learning_rate": 1.7062057465082046e-09, "loss": 0.0004, "num_tokens": 75542395.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4761900901794434, "sampling/importance_sampling_ratio/mean": 0.9994564056396484, "sampling/importance_sampling_ratio/min": 0.2961808741092682, "sampling/sampling_logp_difference/max": 1.216784954071045, "sampling/sampling_logp_difference/mean": 0.01482559833675623, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 193.015625, "completions/mean_terminated_length": 193.015625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3677944540977478, "epoch": 2.935049019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.015931602905119236, "kl": 0.025228401646018028, "learning_rate": 1.6479103221211377e-09, "loss": 0.0002, "num_tokens": 75574124.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4348173141479492, "sampling/importance_sampling_ratio/mean": 0.9999492764472961, "sampling/importance_sampling_ratio/min": 0.6410535573959351, "sampling/sampling_logp_difference/max": 0.4446423053741455, "sampling/sampling_logp_difference/mean": 0.013891877606511116, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 222.828125, "completions/mean_terminated_length": 222.828125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.410091757774353, "epoch": 2.936274509803922, "frac_reward_zero_std": 0.75, "grad_norm": 0.7605047428407047, "kl": 0.023822134360671043, "learning_rate": 1.5906265209045254e-09, "loss": 0.0033, "num_tokens": 75606033.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.4722143411636353, "sampling/importance_sampling_ratio/mean": 0.9998899698257446, "sampling/importance_sampling_ratio/min": 0.726360559463501, "sampling/sampling_logp_difference/max": 0.3867676258087158, "sampling/sampling_logp_difference/mean": 0.014993167482316494, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 204.984375, "completions/mean_terminated_length": 204.984375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3174899220466614, "epoch": 2.9375, "frac_reward_zero_std": 0.75, "grad_norm": 0.8137996587007464, "kl": 0.022347180172801018, "learning_rate": 1.534354459140963e-09, "loss": 0.002, "num_tokens": 75633344.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2788511514663696, "sampling/importance_sampling_ratio/mean": 0.9993703365325928, "sampling/importance_sampling_ratio/min": 0.6783400774002075, "sampling/sampling_logp_difference/max": 0.38810646533966064, "sampling/sampling_logp_difference/mean": 0.012044595554471016, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 194.0625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3399444818496704, "epoch": 2.938725490196078, "frac_reward_zero_std": 0.75, "grad_norm": 0.8415715402340155, "kl": 0.03094661235809326, "learning_rate": 1.4790942510590766e-09, "loss": -0.0217, "num_tokens": 75662676.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.626960277557373, "sampling/importance_sampling_ratio/mean": 1.0001401901245117, "sampling/importance_sampling_ratio/min": 0.6171442866325378, "sampling/sampling_logp_difference/max": 0.4867134094238281, "sampling/sampling_logp_difference/mean": 0.014751039445400238, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 185.984375, "completions/mean_terminated_length": 185.984375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3124786615371704, "epoch": 2.939950980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.9318305373316634, "kl": 0.027180973440408707, "learning_rate": 1.4248460088335801e-09, "loss": -0.0384, "num_tokens": 75691027.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3593418598175049, "sampling/importance_sampling_ratio/mean": 1.0002610683441162, "sampling/importance_sampling_ratio/min": 0.6063899993896484, "sampling/sampling_logp_difference/max": 0.5002319812774658, "sampling/sampling_logp_difference/mean": 0.01331777311861515, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3510400056838989, "epoch": 2.9411764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.016127073630881388, "kl": 0.023105358704924583, "learning_rate": 1.371609842585053e-09, "loss": 0.0002, "num_tokens": 75723395.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5568342208862305, "sampling/importance_sampling_ratio/mean": 1.0002357959747314, "sampling/importance_sampling_ratio/min": 0.6176312565803528, "sampling/sampling_logp_difference/max": 0.48186373710632324, "sampling/sampling_logp_difference/mean": 0.014547735452651978, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 282.859375, "completions/mean_terminated_length": 282.859375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.5836253762245178, "epoch": 2.9424019607843137, "frac_reward_zero_std": 0.5, "grad_norm": 0.7789305490925427, "kl": 0.05978118255734444, "learning_rate": 1.319385860379496e-09, "loss": -0.0128, "num_tokens": 75763802.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5122863054275513, "sampling/importance_sampling_ratio/mean": 0.9999563097953796, "sampling/importance_sampling_ratio/min": 0.6250794529914856, "sampling/sampling_logp_difference/max": 0.4698765277862549, "sampling/sampling_logp_difference/mean": 0.01819756254553795, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 227.0625, "completions/mean_terminated_length": 227.0625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3757852017879486, "epoch": 2.943627450980392, "frac_reward_zero_std": 1.0, "grad_norm": 0.021447603274242773, "kl": 0.025004444643855095, "learning_rate": 1.2681741682282754e-09, "loss": 0.0002, "num_tokens": 75792974.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.592671275138855, "sampling/importance_sampling_ratio/mean": 0.9996393918991089, "sampling/importance_sampling_ratio/min": 0.6106353402137756, "sampling/sampling_logp_difference/max": 0.4932553768157959, "sampling/sampling_logp_difference/mean": 0.014496829360723495, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 241.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3505290746688843, "epoch": 2.9448529411764706, "frac_reward_zero_std": 0.75, "grad_norm": 0.758188779896263, "kl": 0.041052378714084625, "learning_rate": 1.217974870087901e-09, "loss": 0.0243, "num_tokens": 75826094.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5744353532791138, "sampling/importance_sampling_ratio/mean": 0.9998817443847656, "sampling/importance_sampling_ratio/min": 0.6181427240371704, "sampling/sampling_logp_difference/max": 0.4810359477996826, "sampling/sampling_logp_difference/mean": 0.013801928609609604, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 198.296875, "completions/mean_terminated_length": 198.296875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.5521295070648193, "epoch": 2.946078431372549, "frac_reward_zero_std": 0.5, "grad_norm": 1.212770049450111, "kl": 0.05674951523542404, "learning_rate": 1.1687880678596939e-09, "loss": -0.0075, "num_tokens": 75864305.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.461962103843689, "sampling/importance_sampling_ratio/mean": 1.0005958080291748, "sampling/importance_sampling_ratio/min": 0.6441908478736877, "sampling/sampling_logp_difference/max": 0.4397602081298828, "sampling/sampling_logp_difference/mean": 0.018468894064426422, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 165.1875, "completions/mean_terminated_length": 165.1875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.3110926151275635, "epoch": 2.9473039215686274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0208300178904979, "kl": 0.02462848648428917, "learning_rate": 1.1206138613898962e-09, "loss": 0.0002, "num_tokens": 75890061.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5485175848007202, "sampling/importance_sampling_ratio/mean": 1.0005801916122437, "sampling/importance_sampling_ratio/min": 0.7139657139778137, "sampling/sampling_logp_difference/max": 0.437298059463501, "sampling/sampling_logp_difference/mean": 0.013580387458205223, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.31713682413101196, "epoch": 2.9485294117647056, "frac_reward_zero_std": 1.0, "grad_norm": 0.019186981424148625, "kl": 0.025584455579519272, "learning_rate": 1.0734523484689507e-09, "loss": 0.0003, "num_tokens": 75921037.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.600778341293335, "sampling/importance_sampling_ratio/mean": 1.0001356601715088, "sampling/importance_sampling_ratio/min": 0.622538149356842, "sampling/sampling_logp_difference/max": 0.4739503860473633, "sampling/sampling_logp_difference/mean": 0.01394292339682579, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 203.53125, "completions/mean_terminated_length": 203.53125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3787577748298645, "epoch": 2.9497549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.013635619765529598, "kl": 0.020073357969522476, "learning_rate": 1.0273036248318324e-09, "loss": 0.0002, "num_tokens": 75952031.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3519020080566406, "sampling/importance_sampling_ratio/mean": 0.9997328519821167, "sampling/importance_sampling_ratio/min": 0.6203981637954712, "sampling/sampling_logp_difference/max": 0.47739386558532715, "sampling/sampling_logp_difference/mean": 0.0135762644931674, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 216.046875, "completions/mean_terminated_length": 216.046875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3667415976524353, "epoch": 2.950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0279535717611835, "kl": 0.03812876343727112, "learning_rate": 9.82167784157495e-10, "loss": 0.0003, "num_tokens": 75981202.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.413762092590332, "sampling/importance_sampling_ratio/mean": 0.9999436140060425, "sampling/importance_sampling_ratio/min": 0.6165905594825745, "sampling/sampling_logp_difference/max": 0.4835500717163086, "sampling/sampling_logp_difference/mean": 0.013796709477901459, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 195.453125, "completions/mean_terminated_length": 195.453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.41977283358573914, "epoch": 2.952205882352941, "frac_reward_zero_std": 0.75, "grad_norm": 0.8157663726977482, "kl": 0.14047285914421082, "learning_rate": 9.380449180688143e-10, "loss": 0.0211, "num_tokens": 76010527.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.545351266860962, "sampling/importance_sampling_ratio/mean": 1.0002124309539795, "sampling/importance_sampling_ratio/min": 0.7128937244415283, "sampling/sampling_logp_difference/max": 0.43525123596191406, "sampling/sampling_logp_difference/mean": 0.015495553612709045, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 222.78125, "completions/mean_terminated_length": 222.78125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.32274726033210754, "epoch": 2.9534313725490198, "frac_reward_zero_std": 1.0, "grad_norm": 0.012670030737707082, "kl": 0.015602516010403633, "learning_rate": 8.949351161324225e-10, "loss": 0.0002, "num_tokens": 76046641.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4755103588104248, "sampling/importance_sampling_ratio/mean": 1.0002577304840088, "sampling/importance_sampling_ratio/min": 0.6246485710144043, "sampling/sampling_logp_difference/max": 0.4705660343170166, "sampling/sampling_logp_difference/mean": 0.0123797208070755, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 157.46875, "completions/mean_terminated_length": 157.46875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3183969557285309, "epoch": 2.954656862745098, "frac_reward_zero_std": 0.75, "grad_norm": 0.9697665139391681, "kl": 0.02980830706655979, "learning_rate": 8.528384658584853e-10, "loss": -0.0261, "num_tokens": 76071455.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4153097867965698, "sampling/importance_sampling_ratio/mean": 0.9993491172790527, "sampling/importance_sampling_ratio/min": 0.6604135632514954, "sampling/sampling_logp_difference/max": 0.4148890972137451, "sampling/sampling_logp_difference/mean": 0.013639282435178757, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 212.640625, "completions/mean_terminated_length": 212.640625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.33786875009536743, "epoch": 2.9558823529411766, "frac_reward_zero_std": 1.0, "grad_norm": 0.020922592339934605, "kl": 0.027387060225009918, "learning_rate": 8.117550527005912e-10, "loss": 0.0003, "num_tokens": 76101128.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.659095048904419, "sampling/importance_sampling_ratio/mean": 1.0002970695495605, "sampling/importance_sampling_ratio/min": 0.6153237819671631, "sampling/sampling_logp_difference/max": 0.5062723159790039, "sampling/sampling_logp_difference/mean": 0.012486254796385765, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 170.1875, "completions/mean_terminated_length": 170.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.4340662360191345, "epoch": 2.957107843137255, "frac_reward_zero_std": 1.0, "grad_norm": 0.03034505904552388, "kl": 0.05263839662075043, "learning_rate": 7.716849600554188e-10, "loss": 0.0005, "num_tokens": 76129108.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4997516870498657, "sampling/importance_sampling_ratio/mean": 1.000171184539795, "sampling/importance_sampling_ratio/min": 0.6809601187705994, "sampling/sampling_logp_difference/max": 0.4052995443344116, "sampling/sampling_logp_difference/mean": 0.016778860241174698, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 201.8125, "completions/mean_terminated_length": 201.8125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3196262717247009, "epoch": 2.9583333333333335, "frac_reward_zero_std": 0.75, "grad_norm": 0.7345859601982961, "kl": 0.0631108433008194, "learning_rate": 7.326282692626806e-10, "loss": 0.0169, "num_tokens": 76156360.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.7401350736618042, "sampling/importance_sampling_ratio/mean": 1.0001612901687622, "sampling/importance_sampling_ratio/min": 0.5697087049484253, "sampling/sampling_logp_difference/max": 0.5626300573348999, "sampling/sampling_logp_difference/mean": 0.012970691546797752, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 250.78125, "completions/mean_terminated_length": 250.78125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.4877535104751587, "epoch": 2.9595588235294117, "frac_reward_zero_std": 0.75, "grad_norm": 0.654849821949973, "kl": 0.03357920050621033, "learning_rate": 6.945850596050684e-10, "loss": 0.0072, "num_tokens": 76188730.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5156501531600952, "sampling/importance_sampling_ratio/mean": 0.9997298717498779, "sampling/importance_sampling_ratio/min": 0.660491943359375, "sampling/sampling_logp_difference/max": 0.4158444404602051, "sampling/sampling_logp_difference/mean": 0.016852904111146927, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 180.421875, "completions/mean_terminated_length": 180.421875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3513135313987732, "epoch": 2.9607843137254903, "frac_reward_zero_std": 0.75, "grad_norm": 0.9464599334498442, "kl": 0.04228626936674118, "learning_rate": 6.575554083078083e-10, "loss": 0.0394, "num_tokens": 76216757.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.612879991531372, "sampling/importance_sampling_ratio/mean": 1.00016450881958, "sampling/importance_sampling_ratio/min": 0.5362210273742676, "sampling/sampling_logp_difference/max": 0.62320876121521, "sampling/sampling_logp_difference/mean": 0.014640099368989468, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 170.515625, "completions/mean_terminated_length": 170.515625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.29498258233070374, "epoch": 2.9620098039215685, "frac_reward_zero_std": 1.0, "grad_norm": 0.015930040609186152, "kl": 0.022072361782193184, "learning_rate": 6.215393905388278e-10, "loss": 0.0002, "num_tokens": 76243846.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5760538578033447, "sampling/importance_sampling_ratio/mean": 1.0002655982971191, "sampling/importance_sampling_ratio/min": 0.614662230014801, "sampling/sampling_logp_difference/max": 0.4866824150085449, "sampling/sampling_logp_difference/mean": 0.013572830706834793, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 209.9375, "completions/mean_terminated_length": 209.9375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.30850571393966675, "epoch": 2.963235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01826592571893516, "kl": 0.024756554514169693, "learning_rate": 5.865370794082558e-10, "loss": 0.0002, "num_tokens": 76275330.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4055031538009644, "sampling/importance_sampling_ratio/mean": 1.0001401901245117, "sampling/importance_sampling_ratio/min": 0.6222241520881653, "sampling/sampling_logp_difference/max": 0.4744548797607422, "sampling/sampling_logp_difference/mean": 0.01286984235048294, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 196.625, "completions/mean_terminated_length": 196.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3975851535797119, "epoch": 2.9644607843137254, "frac_reward_zero_std": 1.0, "grad_norm": 0.020887113552843894, "kl": 0.028526470065116882, "learning_rate": 5.525485459687007e-10, "loss": 0.0003, "num_tokens": 76305338.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6464240550994873, "sampling/importance_sampling_ratio/mean": 1.0008063316345215, "sampling/importance_sampling_ratio/min": 0.6181831359863281, "sampling/sampling_logp_difference/max": 0.49860572814941406, "sampling/sampling_logp_difference/mean": 0.016536317765712738, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 239.671875, "completions/mean_terminated_length": 239.671875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.37244483828544617, "epoch": 2.965686274509804, "frac_reward_zero_std": 1.0, "grad_norm": 0.013444198409433008, "kl": 0.018550623208284378, "learning_rate": 5.195738592145838e-10, "loss": 0.0002, "num_tokens": 76350549.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7199466228485107, "sampling/importance_sampling_ratio/mean": 0.9998754858970642, "sampling/importance_sampling_ratio/min": 0.678856611251831, "sampling/sampling_logp_difference/max": 0.5422933101654053, "sampling/sampling_logp_difference/mean": 0.014810443855822086, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 237.78125, "completions/mean_terminated_length": 237.78125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.36133065819740295, "epoch": 2.9669117647058822, "frac_reward_zero_std": 1.0, "grad_norm": 0.015155280847526294, "kl": 0.02569812536239624, "learning_rate": 4.876130860825278e-10, "loss": 0.0003, "num_tokens": 76386711.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5359033346176147, "sampling/importance_sampling_ratio/mean": 1.0001566410064697, "sampling/importance_sampling_ratio/min": 0.6060562133789062, "sampling/sampling_logp_difference/max": 0.5007824897766113, "sampling/sampling_logp_difference/mean": 0.013830517418682575, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3214268684387207, "epoch": 2.968137254901961, "frac_reward_zero_std": 1.0, "grad_norm": 0.01717020213442181, "kl": 0.020274590700864792, "learning_rate": 4.566662914508579e-10, "loss": 0.0002, "num_tokens": 76415615.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3767173290252686, "sampling/importance_sampling_ratio/mean": 0.9997262954711914, "sampling/importance_sampling_ratio/min": 0.4835146367549896, "sampling/sampling_logp_difference/max": 0.7266737222671509, "sampling/sampling_logp_difference/mean": 0.014127014204859734, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 223.078125, "completions/mean_terminated_length": 223.078125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.35409465432167053, "epoch": 2.969362745098039, "frac_reward_zero_std": 0.75, "grad_norm": 0.825861705013576, "kl": 0.034467682242393494, "learning_rate": 4.267335381396564e-10, "loss": 0.0101, "num_tokens": 76454516.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.384948492050171, "sampling/importance_sampling_ratio/mean": 1.0002353191375732, "sampling/importance_sampling_ratio/min": 0.6298385858535767, "sampling/sampling_logp_difference/max": 0.4622917175292969, "sampling/sampling_logp_difference/mean": 0.013544151559472084, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3696131110191345, "epoch": 2.9705882352941178, "frac_reward_zero_std": 0.5, "grad_norm": 1.2415214478536805, "kl": 0.06753590703010559, "learning_rate": 3.978148869103748e-10, "loss": 0.0131, "num_tokens": 76484764.0, "reward": 0.3125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.5278277397155762, "sampling/importance_sampling_ratio/mean": 0.9996430277824402, "sampling/importance_sampling_ratio/min": 0.6368826031684875, "sampling/sampling_logp_difference/max": 0.4511699676513672, "sampling/sampling_logp_difference/mean": 0.014389928430318832, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 159.84375, "completions/mean_terminated_length": 159.84375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.35769280791282654, "epoch": 2.971813725490196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01811837475727411, "kl": 0.02568848803639412, "learning_rate": 3.699103964661665e-10, "loss": 0.0003, "num_tokens": 76526946.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.564016580581665, "sampling/importance_sampling_ratio/mean": 0.9992374181747437, "sampling/importance_sampling_ratio/min": 0.6108652353286743, "sampling/sampling_logp_difference/max": 0.49287891387939453, "sampling/sampling_logp_difference/mean": 0.014937417581677437, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 160.6875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.36411798000335693, "epoch": 2.9730392156862746, "frac_reward_zero_std": 0.75, "grad_norm": 0.9997063482295577, "kl": 0.059302765876054764, "learning_rate": 3.430201234513874e-10, "loss": 0.0068, "num_tokens": 76550814.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4970269203186035, "sampling/importance_sampling_ratio/mean": 1.0000662803649902, "sampling/importance_sampling_ratio/min": 0.5362949371337891, "sampling/sampling_logp_difference/max": 0.6230709552764893, "sampling/sampling_logp_difference/mean": 0.015909433364868164, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 200.65625, "completions/mean_terminated_length": 200.65625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.39167535305023193, "epoch": 2.974264705882353, "frac_reward_zero_std": 0.75, "grad_norm": 0.9152092662765233, "kl": 0.03770778328180313, "learning_rate": 3.171441224514848e-10, "loss": -0.0107, "num_tokens": 76582056.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.416238784790039, "sampling/importance_sampling_ratio/mean": 1.0002343654632568, "sampling/importance_sampling_ratio/min": 0.6949491500854492, "sampling/sampling_logp_difference/max": 0.36391663551330566, "sampling/sampling_logp_difference/mean": 0.015838623046875, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 210.21875, "completions/mean_terminated_length": 210.21875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3457028567790985, "epoch": 2.9754901960784315, "frac_reward_zero_std": 1.0, "grad_norm": 0.014345896259431879, "kl": 0.019017256796360016, "learning_rate": 2.922824459931639e-10, "loss": 0.0002, "num_tokens": 76616086.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4058324098587036, "sampling/importance_sampling_ratio/mean": 1.0000433921813965, "sampling/importance_sampling_ratio/min": 0.5289757251739502, "sampling/sampling_logp_difference/max": 0.6368128061294556, "sampling/sampling_logp_difference/mean": 0.013697953894734383, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.4409083127975464, "epoch": 2.9767156862745097, "frac_reward_zero_std": 1.0, "grad_norm": 0.024087873197584663, "kl": 0.0422600694000721, "learning_rate": 2.684351445440547e-10, "loss": 0.0004, "num_tokens": 76652878.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5682049989700317, "sampling/importance_sampling_ratio/mean": 1.0005720853805542, "sampling/importance_sampling_ratio/min": 0.6955130696296692, "sampling/sampling_logp_difference/max": 0.44993162155151367, "sampling/sampling_logp_difference/mean": 0.015224607661366463, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 187.703125, "completions/mean_terminated_length": 187.703125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.47271329164505005, "epoch": 2.9779411764705883, "frac_reward_zero_std": 0.75, "grad_norm": 0.9832786884744623, "kl": 0.06448041647672653, "learning_rate": 2.456022665127122e-10, "loss": 0.0155, "num_tokens": 76689019.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.7292759418487549, "sampling/importance_sampling_ratio/mean": 1.0006279945373535, "sampling/importance_sampling_ratio/min": 0.7186931371688843, "sampling/sampling_logp_difference/max": 0.5477027893066406, "sampling/sampling_logp_difference/mean": 0.017432495951652527, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 188.703125, "completions/mean_terminated_length": 188.703125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3937327563762665, "epoch": 2.9791666666666665, "frac_reward_zero_std": 1.0, "grad_norm": 0.017894899872886165, "kl": 0.029064467176795006, "learning_rate": 2.2378385824833866e-10, "loss": 0.0003, "num_tokens": 76721544.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.588956594467163, "sampling/importance_sampling_ratio/mean": 1.0005124807357788, "sampling/importance_sampling_ratio/min": 0.6262632608413696, "sampling/sampling_logp_difference/max": 0.4679844379425049, "sampling/sampling_logp_difference/mean": 0.016251537948846817, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.43084716796875, "epoch": 2.980392156862745, "frac_reward_zero_std": 0.75, "grad_norm": 0.7980041187157573, "kl": 0.034850817173719406, "learning_rate": 2.0297996404095018e-10, "loss": 0.0219, "num_tokens": 76752128.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.5484894514083862, "sampling/importance_sampling_ratio/mean": 1.0004076957702637, "sampling/importance_sampling_ratio/min": 0.6262337565422058, "sampling/sampling_logp_difference/max": 0.468031644821167, "sampling/sampling_logp_difference/mean": 0.016249552369117737, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 209.078125, "completions/mean_terminated_length": 209.078125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.37195223569869995, "epoch": 2.9816176470588234, "frac_reward_zero_std": 1.0, "grad_norm": 0.018372951542397854, "kl": 0.022795135155320168, "learning_rate": 1.8319062612115467e-10, "loss": 0.0002, "num_tokens": 76783445.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4479608535766602, "sampling/importance_sampling_ratio/mean": 0.9999375343322754, "sampling/importance_sampling_ratio/min": 0.6661897301673889, "sampling/sampling_logp_difference/max": 0.406180739402771, "sampling/sampling_logp_difference/mean": 0.01377858780324459, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 195.15625, "completions/mean_terminated_length": 195.15625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3060455918312073, "epoch": 2.982843137254902, "frac_reward_zero_std": 1.0, "grad_norm": 0.013220137123882814, "kl": 0.020647821947932243, "learning_rate": 1.6441588466009627e-10, "loss": 0.0002, "num_tokens": 76812511.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.862382411956787, "sampling/importance_sampling_ratio/mean": 0.9997444152832031, "sampling/importance_sampling_ratio/min": 0.6732436418533325, "sampling/sampling_logp_difference/max": 0.6218565702438354, "sampling/sampling_logp_difference/mean": 0.013443908654153347, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.38746321201324463, "epoch": 2.9840686274509802, "frac_reward_zero_std": 1.0, "grad_norm": 0.040917264826652325, "kl": 0.06290300190448761, "learning_rate": 1.4665577776923343e-10, "loss": 0.0007, "num_tokens": 76844179.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6007455587387085, "sampling/importance_sampling_ratio/mean": 0.9994732737541199, "sampling/importance_sampling_ratio/min": 0.6137450933456421, "sampling/sampling_logp_difference/max": 0.488175630569458, "sampling/sampling_logp_difference/mean": 0.015084675513207912, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 180.890625, "completions/mean_terminated_length": 180.890625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.39325374364852905, "epoch": 2.985294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.026781616326749733, "kl": 0.04768332093954086, "learning_rate": 1.2991034150050538e-10, "loss": 0.0005, "num_tokens": 76871644.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6221674680709839, "sampling/importance_sampling_ratio/mean": 1.000661849975586, "sampling/importance_sampling_ratio/min": 0.6309764981269836, "sampling/sampling_logp_difference/max": 0.4837632179260254, "sampling/sampling_logp_difference/mean": 0.014678483828902245, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 230.140625, "completions/mean_terminated_length": 230.140625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3076133131980896, "epoch": 2.986519607843137, "frac_reward_zero_std": 1.0, "grad_norm": 0.022439207037943637, "kl": 0.027489451691508293, "learning_rate": 1.1417960984605457e-10, "loss": 0.0003, "num_tokens": 76902037.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6770782470703125, "sampling/importance_sampling_ratio/mean": 1.0006099939346313, "sampling/importance_sampling_ratio/min": 0.5068971514701843, "sampling/sampling_logp_difference/max": 0.6794471740722656, "sampling/sampling_logp_difference/mean": 0.013162676244974136, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 240.21875, "completions/mean_terminated_length": 240.21875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.33798423409461975, "epoch": 2.9877450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 0.6925080574839301, "kl": 0.02133062668144703, "learning_rate": 9.946361473822662e-11, "loss": 0.015, "num_tokens": 76936771.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.4189066886901855, "sampling/importance_sampling_ratio/mean": 1.0010058879852295, "sampling/importance_sampling_ratio/min": 0.6178327202796936, "sampling/sampling_logp_difference/max": 0.4815375804901123, "sampling/sampling_logp_difference/mean": 0.01256520114839077, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 260.65625, "completions/mean_terminated_length": 260.65625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3749733567237854, "epoch": 2.9889705882352944, "frac_reward_zero_std": 0.75, "grad_norm": 0.7822023535083346, "kl": 0.031120579689741135, "learning_rate": 8.576238604968144e-11, "loss": -0.0014, "num_tokens": 76973629.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4561256170272827, "sampling/importance_sampling_ratio/mean": 0.9998626112937927, "sampling/importance_sampling_ratio/min": 0.5609079599380493, "sampling/sampling_logp_difference/max": 0.5781984329223633, "sampling/sampling_logp_difference/mean": 0.013883860781788826, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 182.15625, "completions/mean_terminated_length": 182.15625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.36464837193489075, "epoch": 2.9901960784313726, "frac_reward_zero_std": 0.75, "grad_norm": 0.7809426881687429, "kl": 0.040025513619184494, "learning_rate": 7.307595159300461e-11, "loss": -0.0036, "num_tokens": 77005191.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4753769636154175, "sampling/importance_sampling_ratio/mean": 0.9999262094497681, "sampling/importance_sampling_ratio/min": 0.5693280696868896, "sampling/sampling_logp_difference/max": 0.5632984638214111, "sampling/sampling_logp_difference/mean": 0.014043103903532028, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 207.9375, "completions/mean_terminated_length": 207.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.4231107831001282, "epoch": 2.991421568627451, "frac_reward_zero_std": 1.0, "grad_norm": 0.022112389211604168, "kl": 0.029684830456972122, "learning_rate": 6.140433712076287e-11, "loss": 0.0003, "num_tokens": 77039411.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5629370212554932, "sampling/importance_sampling_ratio/mean": 1.0006803274154663, "sampling/importance_sampling_ratio/min": 0.6103478074073792, "sampling/sampling_logp_difference/max": 0.4937262535095215, "sampling/sampling_logp_difference/mean": 0.01552946213632822, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 237.328125, "completions/mean_terminated_length": 237.328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3809570372104645, "epoch": 2.9926470588235294, "frac_reward_zero_std": 0.5, "grad_norm": 0.9849094616363699, "kl": 0.06580809503793716, "learning_rate": 5.074756632572619e-11, "loss": -0.0084, "num_tokens": 77071656.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4351915121078491, "sampling/importance_sampling_ratio/mean": 0.999637246131897, "sampling/importance_sampling_ratio/min": 0.5690277218818665, "sampling/sampling_logp_difference/max": 0.5638261437416077, "sampling/sampling_logp_difference/mean": 0.013923577964305878, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 209.34375, "completions/mean_terminated_length": 209.34375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.40023350715637207, "epoch": 2.993872549019608, "frac_reward_zero_std": 0.75, "grad_norm": 0.9166245589220879, "kl": 0.02449093759059906, "learning_rate": 4.110566084036815e-11, "loss": -0.0108, "num_tokens": 77101838.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.649056315422058, "sampling/importance_sampling_ratio/mean": 0.9999035000801086, "sampling/importance_sampling_ratio/min": 0.6530977487564087, "sampling/sampling_logp_difference/max": 0.5002031326293945, "sampling/sampling_logp_difference/mean": 0.015382511541247368, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 207.734375, "completions/mean_terminated_length": 207.734375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.44583308696746826, "epoch": 2.9950980392156863, "frac_reward_zero_std": 1.0, "grad_norm": 0.03138498208642566, "kl": 0.0639105960726738, "learning_rate": 3.247864023719904e-11, "loss": 0.0006, "num_tokens": 77130973.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5488362312316895, "sampling/importance_sampling_ratio/mean": 0.9999371767044067, "sampling/importance_sampling_ratio/min": 0.6546967029571533, "sampling/sampling_logp_difference/max": 0.4375038146972656, "sampling/sampling_logp_difference/mean": 0.016407610848546028, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 158.953125, "completions/mean_terminated_length": 158.953125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3797130882740021, "epoch": 2.9963235294117645, "frac_reward_zero_std": 0.75, "grad_norm": 0.9006243160778002, "kl": 0.06772524863481522, "learning_rate": 2.4866522028488268e-11, "loss": 0.0037, "num_tokens": 77157978.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.424027919769287, "sampling/importance_sampling_ratio/mean": 0.999600887298584, "sampling/importance_sampling_ratio/min": 0.7135008573532104, "sampling/sampling_logp_difference/max": 0.35348939895629883, "sampling/sampling_logp_difference/mean": 0.015492400154471397, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 179.671875, "completions/mean_terminated_length": 179.671875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.44988203048706055, "epoch": 2.997549019607843, "frac_reward_zero_std": 1.0, "grad_norm": 0.04407334586845134, "kl": 0.05289856344461441, "learning_rate": 1.8269321666375403e-11, "loss": 0.0004, "num_tokens": 77186725.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4753925800323486, "sampling/importance_sampling_ratio/mean": 0.9999063611030579, "sampling/importance_sampling_ratio/min": 0.7180938720703125, "sampling/sampling_logp_difference/max": 0.38892412185668945, "sampling/sampling_logp_difference/mean": 0.017196040600538254, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 216.796875, "completions/mean_terminated_length": 216.796875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3979523181915283, "epoch": 2.998774509803922, "frac_reward_zero_std": 1.0, "grad_norm": 0.021870598023562535, "kl": 0.03111506626009941, "learning_rate": 1.2687052542759147e-11, "loss": 0.0003, "num_tokens": 77219448.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5071574449539185, "sampling/importance_sampling_ratio/mean": 0.9996896982192993, "sampling/importance_sampling_ratio/min": 0.574481725692749, "sampling/sampling_logp_difference/max": 0.5542869567871094, "sampling/sampling_logp_difference/mean": 0.015271512791514397, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 195.546875, "completions/mean_terminated_length": 195.546875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.35060396790504456, "epoch": 3.0, "frac_reward_zero_std": 1.0, "grad_norm": 0.02021418566622152, "kl": 0.025219064205884933, "learning_rate": 8.119725989241822e-12, "loss": 0.0002, "num_tokens": 77246587.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6122491359710693, "sampling/importance_sampling_ratio/mean": 1.0004022121429443, "sampling/importance_sampling_ratio/min": 0.6406969428062439, "sampling/sampling_logp_difference/max": 0.4776301383972168, "sampling/sampling_logp_difference/mean": 0.014867585152387619, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 207.796875, "completions/mean_terminated_length": 207.796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3712385594844818, "epoch": 3.0012254901960786, "frac_reward_zero_std": 0.5, "grad_norm": 1.3733932754087725, "kl": 0.030660737305879593, "learning_rate": 4.5673512772959055e-12, "loss": 0.0457, "num_tokens": 77281118.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998342990875244, "sampling/importance_sampling_ratio/min": 0.6906341314315796, "sampling/sampling_logp_difference/max": 0.7512912750244141, "sampling/sampling_logp_difference/mean": 0.014142373576760292, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 145.359375, "completions/mean_terminated_length": 145.359375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.4449942708015442, "epoch": 3.002450980392157, "frac_reward_zero_std": 0.75, "grad_norm": 1.020521614562299, "kl": 0.061585891991853714, "learning_rate": 2.0299356179309666e-12, "loss": 0.0156, "num_tokens": 77314037.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 0.9996541142463684, "sampling/importance_sampling_ratio/min": 0.6051085591316223, "sampling/sampling_logp_difference/max": 0.502347469329834, "sampling/sampling_logp_difference/mean": 0.01772310584783554, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.33885860443115234, "epoch": 3.0036764705882355, "frac_reward_zero_std": 1.0, "grad_norm": 0.018110718259315723, "kl": 0.02658119425177574, "learning_rate": 5.074841620267278e-13, "loss": 0.0003, "num_tokens": 77342493.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003312826156616, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.9316283464431763, "sampling/sampling_logp_difference/mean": 0.013981176540255547, "step": 2451 } ], "logging_steps": 1, "max_steps": 2451, "num_input_tokens_seen": 77342493, "num_train_epochs": 4, "save_steps": 817, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }