{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.024, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 10.53125, "completions/mean_terminated_length": 7.6666669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.239578485488892, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.11523175239562988, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0179, "num_tokens": 40500.0, "reward": 0.03224813938140869, "reward_std": 0.8015883564949036, "rewards/rollout_reward_func/mean": 0.03224813938140869, "rewards/rollout_reward_func/std": 0.8015883564949036, "sampling/importance_sampling_ratio/max": 0.4935462176799774, "sampling/importance_sampling_ratio/mean": 0.10224869847297668, "sampling/importance_sampling_ratio/min": 3.809343507299445e-09, "sampling/sampling_logp_difference/max": 2.3248469829559326, "sampling/sampling_logp_difference/mean": 0.6373271942138672, "step": 1, "step_time": 14.930843470996479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.239578485488892, "epoch": 4e-05, "grad_norm": 0.1207628846168518, "kl": 0.0, "learning_rate": 2.2857142857142855e-07, "loss": -0.0179, "step": 2, "step_time": 7.349735933006741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 11.21875, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.318372189998627, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.10787655413150787, "kl": 0.0003172830802213866, "learning_rate": 4.571428571428571e-07, "loss": -0.0172, "num_tokens": 82678.0, "reward": -0.19924837350845337, "reward_std": 0.8888105154037476, "rewards/rollout_reward_func/mean": -0.19924837350845337, "rewards/rollout_reward_func/std": 0.8888105154037476, "sampling/importance_sampling_ratio/max": 0.3840599060058594, "sampling/importance_sampling_ratio/mean": 0.07858596742153168, "sampling/importance_sampling_ratio/min": 5.863767045706514e-13, "sampling/sampling_logp_difference/max": 2.7227702140808105, "sampling/sampling_logp_difference/mean": 0.6937954425811768, "step": 3, "step_time": 14.549954476999119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.317917704582214, "epoch": 8e-05, "grad_norm": 0.10515999048948288, "kl": 0.00035806918458547443, "learning_rate": 6.857142857142857e-07, "loss": -0.017, "step": 4, "step_time": 7.3574429460277315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 11.59375, "completions/mean_terminated_length": 7.705882549285889, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.435825228691101, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 0.07017651945352554, "kl": 0.00033981550950556993, "learning_rate": 9.142857142857142e-07, "loss": -0.011, "num_tokens": 124200.0, "reward": -0.21212950348854065, "reward_std": 0.7807229161262512, "rewards/rollout_reward_func/mean": -0.21212950348854065, "rewards/rollout_reward_func/std": 0.7807229161262512, "sampling/importance_sampling_ratio/max": 0.3732198178768158, "sampling/importance_sampling_ratio/mean": 0.07304741442203522, "sampling/importance_sampling_ratio/min": 6.042922828797259e-10, "sampling/sampling_logp_difference/max": 2.372177839279175, "sampling/sampling_logp_difference/mean": 0.7019559741020203, "step": 5, "step_time": 14.635353485035012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.435489535331726, "epoch": 0.00012, "grad_norm": 0.06887838989496231, "kl": 0.00029581049602711573, "learning_rate": 1.1428571428571428e-06, "loss": -0.011, "step": 6, "step_time": 7.3584295180335175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 6.38095235824585, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 4.049726366996765, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 0.09409166127443314, "kl": 0.0002636039425851777, "learning_rate": 1.3714285714285715e-06, "loss": -0.0101, "num_tokens": 164711.0, "reward": 0.07138202339410782, "reward_std": 0.9087281227111816, "rewards/rollout_reward_func/mean": 0.07138202339410782, "rewards/rollout_reward_func/std": 0.9087281227111816, "sampling/importance_sampling_ratio/max": 0.9864997267723083, "sampling/importance_sampling_ratio/mean": 0.14362046122550964, "sampling/importance_sampling_ratio/min": 3.2099833902066166e-07, "sampling/sampling_logp_difference/max": 1.7980015277862549, "sampling/sampling_logp_difference/mean": 0.5813089609146118, "step": 7, "step_time": 14.35529181000311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.049126625061035, "epoch": 0.00016, "grad_norm": 0.08627655357122421, "kl": 0.00029194985108915716, "learning_rate": 1.6e-06, "loss": -0.0103, "step": 8, "step_time": 8.495267413993133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.9375, "completions/mean_terminated_length": 6.300000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.079105854034424, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.10413724929094315, "kl": 0.00027581091489992104, "learning_rate": 1.8285714285714284e-06, "loss": -0.0145, "num_tokens": 207065.0, "reward": 0.17951779067516327, "reward_std": 0.7891321778297424, "rewards/rollout_reward_func/mean": 0.17951779067516327, "rewards/rollout_reward_func/std": 0.7891321182250977, "sampling/importance_sampling_ratio/max": 0.47657299041748047, "sampling/importance_sampling_ratio/mean": 0.11805976927280426, "sampling/importance_sampling_ratio/min": 9.131189138145146e-09, "sampling/sampling_logp_difference/max": 2.1733293533325195, "sampling/sampling_logp_difference/mean": 0.6176249980926514, "step": 9, "step_time": 14.524425113981124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.077738642692566, "epoch": 0.0002, "grad_norm": 0.10270091891288757, "kl": 0.00031047324955579825, "learning_rate": 2.057142857142857e-06, "loss": -0.0146, "step": 10, "step_time": 7.5032295949349646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 9.21875, "completions/mean_terminated_length": 6.136363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.01254665851593, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 0.09566973894834518, "kl": 0.0004022720459033735, "learning_rate": 2.2857142857142856e-06, "loss": -0.0139, "num_tokens": 248927.0, "reward": 0.01580996811389923, "reward_std": 0.8122813701629639, "rewards/rollout_reward_func/mean": 0.01580996811389923, "rewards/rollout_reward_func/std": 0.8122813701629639, "sampling/importance_sampling_ratio/max": 0.592956006526947, "sampling/importance_sampling_ratio/mean": 0.14927938580513, "sampling/importance_sampling_ratio/min": 3.941320880773702e-10, "sampling/sampling_logp_difference/max": 2.404111385345459, "sampling/sampling_logp_difference/mean": 0.6090236902236938, "step": 11, "step_time": 14.534383864986012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.007008671760559, "epoch": 0.00024, "grad_norm": 0.0953749492764473, "kl": 0.0005712400452466682, "learning_rate": 2.5142857142857142e-06, "loss": -0.0139, "step": 12, "step_time": 7.424258839950198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 11.21875, "completions/mean_terminated_length": 6.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.917647957801819, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 0.06092236563563347, "kl": 0.0004598381638061255, "learning_rate": 2.742857142857143e-06, "loss": -0.0073, "num_tokens": 290881.0, "reward": -0.15022556483745575, "reward_std": 0.8482059836387634, "rewards/rollout_reward_func/mean": -0.15022556483745575, "rewards/rollout_reward_func/std": 0.8482059240341187, "sampling/importance_sampling_ratio/max": 0.7298710346221924, "sampling/importance_sampling_ratio/mean": 0.10054843127727509, "sampling/importance_sampling_ratio/min": 8.915451132907037e-08, "sampling/sampling_logp_difference/max": 2.1249516010284424, "sampling/sampling_logp_difference/mean": 0.5704424381256104, "step": 13, "step_time": 14.759364509023726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9158891439437866, "epoch": 0.00028, "grad_norm": 0.06287423521280289, "kl": 0.0006289545999607071, "learning_rate": 2.9714285714285716e-06, "loss": -0.0073, "step": 14, "step_time": 7.383632490993477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 10.5625, "completions/mean_terminated_length": 6.333333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.192338287830353, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 0.06778133660554886, "kl": 0.0007377296860795468, "learning_rate": 3.2e-06, "loss": -0.0102, "num_tokens": 334405.0, "reward": -0.07384219765663147, "reward_std": 0.8479647636413574, "rewards/rollout_reward_func/mean": -0.07384219765663147, "rewards/rollout_reward_func/std": 0.8479647636413574, "sampling/importance_sampling_ratio/max": 0.39594781398773193, "sampling/importance_sampling_ratio/mean": 0.10623657703399658, "sampling/importance_sampling_ratio/min": 1.4451283725236408e-08, "sampling/sampling_logp_difference/max": 2.3065097332000732, "sampling/sampling_logp_difference/mean": 0.6380860805511475, "step": 15, "step_time": 14.589917394012446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.190466940402985, "epoch": 0.00032, "grad_norm": 0.06782454997301102, "kl": 0.0008779604977462441, "learning_rate": 3.428571428571428e-06, "loss": -0.0103, "step": 16, "step_time": 7.46608664598898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 10.21875, "completions/mean_terminated_length": 6.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.331807255744934, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 0.10327531397342682, "kl": 0.0014278539165388793, "learning_rate": 3.657142857142857e-06, "loss": -0.0168, "num_tokens": 377462.0, "reward": 0.005396826192736626, "reward_std": 0.8053250312805176, "rewards/rollout_reward_func/mean": 0.005396826192736626, "rewards/rollout_reward_func/std": 0.8053250312805176, "sampling/importance_sampling_ratio/max": 0.5050538778305054, "sampling/importance_sampling_ratio/mean": 0.13272428512573242, "sampling/importance_sampling_ratio/min": 1.2730974496832914e-08, "sampling/sampling_logp_difference/max": 2.2027411460876465, "sampling/sampling_logp_difference/mean": 0.6466052532196045, "step": 17, "step_time": 14.41751138100517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.328067183494568, "epoch": 0.00036, "grad_norm": 0.10751788318157196, "kl": 0.001955893822014332, "learning_rate": 3.885714285714286e-06, "loss": -0.0169, "step": 18, "step_time": 8.15666441299254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 5.608695983886719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.961883544921875, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 0.09506737440824509, "kl": 0.004055569646880031, "learning_rate": 4.114285714285714e-06, "loss": -0.0175, "num_tokens": 418295.0, "reward": 0.046298250555992126, "reward_std": 0.8541479706764221, "rewards/rollout_reward_func/mean": 0.046298250555992126, "rewards/rollout_reward_func/std": 0.8541479706764221, "sampling/importance_sampling_ratio/max": 0.49439045786857605, "sampling/importance_sampling_ratio/mean": 0.18179430067539215, "sampling/importance_sampling_ratio/min": 1.1001319855097336e-08, "sampling/sampling_logp_difference/max": 2.2957816123962402, "sampling/sampling_logp_difference/mean": 0.5884032249450684, "step": 19, "step_time": 13.933068175014341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.9484644532203674, "epoch": 0.0004, "grad_norm": 0.09595756977796555, "kl": 0.005810118746012449, "learning_rate": 4.342857142857142e-06, "loss": -0.0179, "step": 20, "step_time": 7.3871981180273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 5.26086950302124, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.1399001479148865, "epoch": 0.00042, "frac_reward_zero_std": 0.125, "grad_norm": 0.0982261449098587, "kl": 0.008757472038269043, "learning_rate": 4.571428571428571e-06, "loss": -0.019, "num_tokens": 459184.0, "reward": 0.06227298080921173, "reward_std": 0.8516911268234253, "rewards/rollout_reward_func/mean": 0.06227298080921173, "rewards/rollout_reward_func/std": 0.8516910672187805, "sampling/importance_sampling_ratio/max": 0.5965237617492676, "sampling/importance_sampling_ratio/mean": 0.18625321984291077, "sampling/importance_sampling_ratio/min": 2.1200602384752187e-10, "sampling/sampling_logp_difference/max": 3.305255889892578, "sampling/sampling_logp_difference/mean": 0.6594976186752319, "step": 21, "step_time": 13.905976124980953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.122564077377319, "epoch": 0.00044, "grad_norm": 0.0960918441414833, "kl": 0.012550872517749667, "learning_rate": 4.8e-06, "loss": -0.0196, "step": 22, "step_time": 7.360382560000289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.90625, "completions/mean_terminated_length": 7.136363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 4.060932278633118, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 0.07630966603755951, "kl": 0.01600770093500614, "learning_rate": 5.0285714285714285e-06, "loss": 0.0007, "num_tokens": 502194.0, "reward": 0.03604554384946823, "reward_std": 0.7266662120819092, "rewards/rollout_reward_func/mean": 0.03604554384946823, "rewards/rollout_reward_func/std": 0.7266662120819092, "sampling/importance_sampling_ratio/max": 0.6753280162811279, "sampling/importance_sampling_ratio/mean": 0.13074152171611786, "sampling/importance_sampling_ratio/min": 1.677815220091361e-07, "sampling/sampling_logp_difference/max": 2.1159329414367676, "sampling/sampling_logp_difference/mean": 0.5957749485969543, "step": 23, "step_time": 14.589931307011284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.044048726558685, "epoch": 0.00048, "grad_norm": 0.07775864750146866, "kl": 0.022890843451023102, "learning_rate": 5.257142857142857e-06, "loss": 0.0006, "step": 24, "step_time": 7.4191237640043255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.8125, "completions/mean_terminated_length": 6.799999713897705, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.7556194067001343, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 0.09937179833650589, "kl": 0.027981107123196125, "learning_rate": 5.485714285714286e-06, "loss": -0.021, "num_tokens": 540960.0, "reward": -0.12019962817430496, "reward_std": 0.7724800109863281, "rewards/rollout_reward_func/mean": -0.12019962817430496, "rewards/rollout_reward_func/std": 0.7724799513816833, "sampling/importance_sampling_ratio/max": 0.6373160481452942, "sampling/importance_sampling_ratio/mean": 0.2226800173521042, "sampling/importance_sampling_ratio/min": 1.1813025002638256e-09, "sampling/sampling_logp_difference/max": 2.6536407470703125, "sampling/sampling_logp_difference/mean": 0.5918028354644775, "step": 25, "step_time": 14.047980981995352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.72335147857666, "epoch": 0.00052, "grad_norm": 0.09555613994598389, "kl": 0.03501995326951146, "learning_rate": 5.7142857142857145e-06, "loss": -0.0216, "step": 26, "step_time": 7.322929550980916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 6.269230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.9938231706619263, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 0.11543671786785126, "kl": 0.06253800448030233, "learning_rate": 5.942857142857143e-06, "loss": -0.0249, "num_tokens": 582851.0, "reward": 0.19693678617477417, "reward_std": 0.8555697202682495, "rewards/rollout_reward_func/mean": 0.19693678617477417, "rewards/rollout_reward_func/std": 0.8555696606636047, "sampling/importance_sampling_ratio/max": 0.7905201315879822, "sampling/importance_sampling_ratio/mean": 0.29346346855163574, "sampling/importance_sampling_ratio/min": 1.2449618592980105e-11, "sampling/sampling_logp_difference/max": 2.5624756813049316, "sampling/sampling_logp_difference/mean": 0.6281294822692871, "step": 27, "step_time": 14.967007311992347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00972250779159367, "clip_ratio/low_min": 0.005681818351149559, "clip_ratio/region_mean": 0.00972250779159367, "entropy": 3.955626606941223, "epoch": 0.00056, "grad_norm": 0.10730621963739395, "kl": 0.08828557468950748, "learning_rate": 6.171428571428571e-06, "loss": -0.0258, "step": 28, "step_time": 7.378870050015394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 6.519999980926514, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.4480834007263184, "epoch": 0.00058, "frac_reward_zero_std": 0.125, "grad_norm": 0.10864230990409851, "kl": 0.09903974086046219, "learning_rate": 6.4e-06, "loss": -0.0293, "num_tokens": 624824.0, "reward": 0.051831603050231934, "reward_std": 0.7311508059501648, "rewards/rollout_reward_func/mean": 0.051831603050231934, "rewards/rollout_reward_func/std": 0.73115074634552, "sampling/importance_sampling_ratio/max": 0.8744103312492371, "sampling/importance_sampling_ratio/mean": 0.30598098039627075, "sampling/importance_sampling_ratio/min": 5.653572543451446e-07, "sampling/sampling_logp_difference/max": 2.0796260833740234, "sampling/sampling_logp_difference/mean": 0.5294226408004761, "step": 29, "step_time": 13.981504588999087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.027777166571468115, "clip_ratio/low_min": 0.00657894741743803, "clip_ratio/region_mean": 0.027777166571468115, "entropy": 3.3702672719955444, "epoch": 0.0006, "grad_norm": 0.10335154086351395, "kl": 0.1588730551302433, "learning_rate": 6.628571428571428e-06, "loss": -0.0303, "step": 30, "step_time": 7.380970048048766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 6.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.3536065816879272, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.19071388244628906, "kl": 0.2171953320503235, "learning_rate": 6.857142857142856e-06, "loss": -0.0434, "num_tokens": 666084.0, "reward": 0.1380191445350647, "reward_std": 0.7973575592041016, "rewards/rollout_reward_func/mean": 0.1380191445350647, "rewards/rollout_reward_func/std": 0.7973574995994568, "sampling/importance_sampling_ratio/max": 0.9698997735977173, "sampling/importance_sampling_ratio/mean": 0.37852537631988525, "sampling/importance_sampling_ratio/min": 8.64331828331899e-10, "sampling/sampling_logp_difference/max": 2.4565038681030273, "sampling/sampling_logp_difference/mean": 0.5556465983390808, "step": 31, "step_time": 13.676230999990366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02225102367810905, "clip_ratio/low_min": 0.008064515888690948, "clip_ratio/region_mean": 0.02225102367810905, "entropy": 3.283941924571991, "epoch": 0.00064, "grad_norm": 0.20110467076301575, "kl": 0.3190930299460888, "learning_rate": 7.085714285714285e-06, "loss": -0.0443, "step": 32, "step_time": 7.3666894489724655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.259259223937988, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.1072129011154175, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 0.157863050699234, "kl": 0.39547519385814667, "learning_rate": 7.314285714285714e-06, "loss": -0.034, "num_tokens": 708074.0, "reward": 0.2271644026041031, "reward_std": 0.7089215517044067, "rewards/rollout_reward_func/mean": 0.2271644026041031, "rewards/rollout_reward_func/std": 0.7089215517044067, "sampling/importance_sampling_ratio/max": 1.0098618268966675, "sampling/importance_sampling_ratio/mean": 0.5146302580833435, "sampling/importance_sampling_ratio/min": 2.9368875331670097e-08, "sampling/sampling_logp_difference/max": 2.1174771785736084, "sampling/sampling_logp_difference/mean": 0.5028680562973022, "step": 33, "step_time": 13.700672397972085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 3.0394179224967957, "epoch": 0.00068, "grad_norm": 0.15630494058132172, "kl": 0.48146650940179825, "learning_rate": 7.542857142857142e-06, "loss": -0.0353, "step": 34, "step_time": 7.364857595966896 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.185185432434082, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.989242732524872, "epoch": 0.0007, "frac_reward_zero_std": 0.125, "grad_norm": 0.12738513946533203, "kl": 0.6992727071046829, "learning_rate": 7.771428571428572e-06, "loss": -0.0284, "num_tokens": 751183.0, "reward": 0.5733144283294678, "reward_std": 0.6753550171852112, "rewards/rollout_reward_func/mean": 0.5733144283294678, "rewards/rollout_reward_func/std": 0.6753550171852112, "sampling/importance_sampling_ratio/max": 1.1152303218841553, "sampling/importance_sampling_ratio/mean": 0.6123095750808716, "sampling/importance_sampling_ratio/min": 1.8486706609976977e-09, "sampling/sampling_logp_difference/max": 2.519895076751709, "sampling/sampling_logp_difference/mean": 0.507644534111023, "step": 35, "step_time": 13.538134431029903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 2.9144036769866943, "epoch": 0.00072, "grad_norm": 0.12730813026428223, "kl": 0.7854489609599113, "learning_rate": 8e-06, "loss": -0.0292, "step": 36, "step_time": 7.42629113199655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.862069129943848, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4031880497932434, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 0.19821125268936157, "kl": 0.47106604278087616, "learning_rate": 7.999999998518522e-06, "loss": -0.0407, "num_tokens": 793369.0, "reward": 0.3375978171825409, "reward_std": 0.76048344373703, "rewards/rollout_reward_func/mean": 0.3375978171825409, "rewards/rollout_reward_func/std": 0.7604835033416748, "sampling/importance_sampling_ratio/max": 1.2729811668395996, "sampling/importance_sampling_ratio/mean": 0.6324254274368286, "sampling/importance_sampling_ratio/min": 7.439131255448928e-11, "sampling/sampling_logp_difference/max": 2.379329204559326, "sampling/sampling_logp_difference/mean": 0.46697747707366943, "step": 37, "step_time": 14.730412610020721 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.011254789307713509, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015719075221568346, "entropy": 2.314267724752426, "epoch": 0.00076, "grad_norm": 0.15749329328536987, "kl": 0.5413542352616787, "learning_rate": 7.99999999407409e-06, "loss": -0.0426, "step": 38, "step_time": 7.384043606027262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3266243636608124, "epoch": 0.00078, "frac_reward_zero_std": 0.25, "grad_norm": 0.146989643573761, "kl": 0.7227168083190918, "learning_rate": 7.999999986666703e-06, "loss": -0.046, "num_tokens": 835192.0, "reward": 0.5053051710128784, "reward_std": 0.8416069149971008, "rewards/rollout_reward_func/mean": 0.5053051710128784, "rewards/rollout_reward_func/std": 0.841606855392456, "sampling/importance_sampling_ratio/max": 1.4656599760055542, "sampling/importance_sampling_ratio/mean": 0.8146834373474121, "sampling/importance_sampling_ratio/min": 4.0769160269960025e-10, "sampling/sampling_logp_difference/max": 2.6081314086914062, "sampling/sampling_logp_difference/mean": 0.45661020278930664, "step": 39, "step_time": 13.485589152987814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017361111473292112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017361111473292112, "entropy": 2.2550963163375854, "epoch": 0.0008, "grad_norm": 0.1183193027973175, "kl": 0.720498263835907, "learning_rate": 7.99999997629636e-06, "loss": -0.0478, "step": 40, "step_time": 7.378542915015714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 5.3214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.600409686565399, "epoch": 0.00082, "frac_reward_zero_std": 0.25, "grad_norm": 0.14211063086986542, "kl": 0.8711760342121124, "learning_rate": 7.999999962963062e-06, "loss": -0.0361, "num_tokens": 875229.0, "reward": 0.2284783571958542, "reward_std": 0.7835679650306702, "rewards/rollout_reward_func/mean": 0.2284783571958542, "rewards/rollout_reward_func/std": 0.7835679054260254, "sampling/importance_sampling_ratio/max": 1.542662501335144, "sampling/importance_sampling_ratio/mean": 0.637143611907959, "sampling/importance_sampling_ratio/min": 1.9522476080169326e-08, "sampling/sampling_logp_difference/max": 2.3506693840026855, "sampling/sampling_logp_difference/mean": 0.4926232099533081, "step": 41, "step_time": 13.348916594986804 }, { "clip_ratio/high_max": 0.007575757801532745, "clip_ratio/high_mean": 0.0037878789007663727, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037878789007663727, "entropy": 2.5537906885147095, "epoch": 0.00084, "grad_norm": 0.1251022070646286, "kl": 0.7510461360216141, "learning_rate": 7.999999946666809e-06, "loss": -0.0371, "step": 42, "step_time": 7.196177757985424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 3.7931034564971924, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6446278393268585, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 0.17659996449947357, "kl": 0.358805775642395, "learning_rate": 7.999999927407602e-06, "loss": -0.0234, "num_tokens": 916136.0, "reward": 0.5156426429748535, "reward_std": 0.7647449970245361, "rewards/rollout_reward_func/mean": 0.5156426429748535, "rewards/rollout_reward_func/std": 0.7647449970245361, "sampling/importance_sampling_ratio/max": 1.83917236328125, "sampling/importance_sampling_ratio/mean": 0.9725204110145569, "sampling/importance_sampling_ratio/min": 1.1565389002043958e-07, "sampling/sampling_logp_difference/max": 2.105419397354126, "sampling/sampling_logp_difference/mean": 0.3761644959449768, "step": 43, "step_time": 13.369177257991396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.614312618970871, "epoch": 0.00088, "grad_norm": 0.18405361473560333, "kl": 0.32504790276288986, "learning_rate": 7.99999990518544e-06, "loss": -0.0242, "step": 44, "step_time": 7.405664066958707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.826163589954376, "epoch": 0.0009, "frac_reward_zero_std": 0.125, "grad_norm": 0.31924188137054443, "kl": 0.3498934954404831, "learning_rate": 7.999999880000322e-06, "loss": -0.0352, "num_tokens": 956399.0, "reward": 0.3180525302886963, "reward_std": 0.91302090883255, "rewards/rollout_reward_func/mean": 0.3180525302886963, "rewards/rollout_reward_func/std": 0.91302090883255, "sampling/importance_sampling_ratio/max": 1.6881080865859985, "sampling/importance_sampling_ratio/mean": 0.7422511577606201, "sampling/importance_sampling_ratio/min": 7.248856626063116e-09, "sampling/sampling_logp_difference/max": 2.0882694721221924, "sampling/sampling_logp_difference/mean": 0.5441356897354126, "step": 45, "step_time": 13.74076682300074 }, { "clip_ratio/high_max": 0.04017094150185585, "clip_ratio/high_mean": 0.02471510088071227, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033048434648662806, "entropy": 2.8052492141723633, "epoch": 0.00092, "grad_norm": 0.16278061270713806, "kl": 0.33164501935243607, "learning_rate": 7.99999985185225e-06, "loss": -0.0368, "step": 46, "step_time": 8.373428152030101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 3.879999876022339, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.289114385843277, "epoch": 0.00094, "frac_reward_zero_std": 0.125, "grad_norm": 0.11968354880809784, "kl": 0.3395223766565323, "learning_rate": 7.999999820741223e-06, "loss": -0.0743, "num_tokens": 997579.0, "reward": 0.47462064027786255, "reward_std": 0.8661351203918457, "rewards/rollout_reward_func/mean": 0.47462064027786255, "rewards/rollout_reward_func/std": 0.8661350607872009, "sampling/importance_sampling_ratio/max": 1.8018475770950317, "sampling/importance_sampling_ratio/mean": 0.8407039642333984, "sampling/importance_sampling_ratio/min": 8.5393876361195e-06, "sampling/sampling_logp_difference/max": 1.7550199031829834, "sampling/sampling_logp_difference/mean": 0.45495498180389404, "step": 47, "step_time": 13.653110896004364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2745291590690613, "epoch": 0.00096, "grad_norm": 0.1157274842262268, "kl": 0.3404731899499893, "learning_rate": 7.99999978666724e-06, "loss": -0.0748, "step": 48, "step_time": 7.408019178023096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.505393862724304, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 0.22574913501739502, "kl": 0.4851592741906643, "learning_rate": 7.999999749630303e-06, "loss": -0.0879, "num_tokens": 1037945.0, "reward": 0.348884642124176, "reward_std": 0.8469893336296082, "rewards/rollout_reward_func/mean": 0.348884642124176, "rewards/rollout_reward_func/std": 0.8469892740249634, "sampling/importance_sampling_ratio/max": 1.742828369140625, "sampling/importance_sampling_ratio/mean": 0.725777804851532, "sampling/importance_sampling_ratio/min": 2.3104075597135676e-10, "sampling/sampling_logp_difference/max": 2.4751367568969727, "sampling/sampling_logp_difference/mean": 0.5477296113967896, "step": 49, "step_time": 13.797736668027937 }, { "clip_ratio/high_max": 0.023157894611358643, "clip_ratio/high_mean": 0.011578947305679321, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011578947305679321, "entropy": 2.4853972494602203, "epoch": 0.001, "grad_norm": 0.20464913547039032, "kl": 0.5774480290710926, "learning_rate": 7.999999709630412e-06, "loss": -0.0892, "step": 50, "step_time": 7.326665709959343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.247926354408264, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 0.3912505805492401, "kl": 0.3444211333990097, "learning_rate": 7.999999666667564e-06, "loss": -0.0389, "num_tokens": 1080867.0, "reward": 0.4916535019874573, "reward_std": 0.7971339225769043, "rewards/rollout_reward_func/mean": 0.4916535019874573, "rewards/rollout_reward_func/std": 0.7971339225769043, "sampling/importance_sampling_ratio/max": 1.796090006828308, "sampling/importance_sampling_ratio/mean": 0.7128292322158813, "sampling/importance_sampling_ratio/min": 6.55779922453803e-06, "sampling/sampling_logp_difference/max": 2.042170524597168, "sampling/sampling_logp_difference/mean": 0.44086456298828125, "step": 51, "step_time": 13.771250004007015 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015224359929561615, "entropy": 2.228363484144211, "epoch": 0.00104, "grad_norm": 0.18221323192119598, "kl": 0.38617198169231415, "learning_rate": 7.999999620741765e-06, "loss": -0.0408, "step": 52, "step_time": 7.396358063007938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9253461509943008, "epoch": 0.00106, "frac_reward_zero_std": 0.125, "grad_norm": 0.3156380355358124, "kl": 1.4818039238452911, "learning_rate": 7.999999571853009e-06, "loss": -0.0845, "num_tokens": 1120770.0, "reward": 0.5039578676223755, "reward_std": 0.8306827545166016, "rewards/rollout_reward_func/mean": 0.5039578676223755, "rewards/rollout_reward_func/std": 0.8306827545166016, "sampling/importance_sampling_ratio/max": 2.147737979888916, "sampling/importance_sampling_ratio/mean": 0.9148068428039551, "sampling/importance_sampling_ratio/min": 8.530739670220555e-09, "sampling/sampling_logp_difference/max": 3.2009634971618652, "sampling/sampling_logp_difference/mean": 0.45561182498931885, "step": 53, "step_time": 13.438114476011833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.9021066054701805, "epoch": 0.00108, "grad_norm": 0.21955159306526184, "kl": 1.142692893743515, "learning_rate": 7.999999520001299e-06, "loss": -0.0864, "step": 54, "step_time": 7.352619911020156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 3.6875, "completions/mean_terminated_length": 3.2903225421905518, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6857795268297195, "epoch": 0.0011, "frac_reward_zero_std": 0.375, "grad_norm": 0.11601017415523529, "kl": 0.9064064919948578, "learning_rate": 7.999999465186634e-06, "loss": -0.0527, "num_tokens": 1162929.0, "reward": 0.7344857454299927, "reward_std": 0.6417108774185181, "rewards/rollout_reward_func/mean": 0.7344857454299927, "rewards/rollout_reward_func/std": 0.6417108774185181, "sampling/importance_sampling_ratio/max": 2.003490924835205, "sampling/importance_sampling_ratio/mean": 1.0385689735412598, "sampling/importance_sampling_ratio/min": 0.00039623776683583856, "sampling/sampling_logp_difference/max": 1.9654433727264404, "sampling/sampling_logp_difference/mean": 0.2630966901779175, "step": 55, "step_time": 13.17217662098119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.022664836142212152, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022664836142212152, "entropy": 0.6707632690668106, "epoch": 0.00112, "grad_norm": 0.07254873216152191, "kl": 1.0190379098057747, "learning_rate": 7.999999407409014e-06, "loss": -0.0532, "step": 56, "step_time": 8.639654350990895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2351385578513145, "epoch": 0.00114, "frac_reward_zero_std": 0.125, "grad_norm": 0.22327664494514465, "kl": 0.4233052060008049, "learning_rate": 7.99999934666844e-06, "loss": -0.0443, "num_tokens": 1203173.0, "reward": 0.47592172026634216, "reward_std": 0.7835609912872314, "rewards/rollout_reward_func/mean": 0.47592172026634216, "rewards/rollout_reward_func/std": 0.7835609912872314, "sampling/importance_sampling_ratio/max": 2.5275075435638428, "sampling/importance_sampling_ratio/mean": 0.9247205257415771, "sampling/importance_sampling_ratio/min": 6.391328355448422e-08, "sampling/sampling_logp_difference/max": 2.136176824569702, "sampling/sampling_logp_difference/mean": 0.37280556559562683, "step": 57, "step_time": 13.359397703956347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01759259309619665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01759259309619665, "entropy": 1.2289419695734978, "epoch": 0.00116, "grad_norm": 0.11674747616052628, "kl": 0.45326171070337296, "learning_rate": 7.999999282964912e-06, "loss": -0.045, "step": 58, "step_time": 7.378293102985481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2536525949835777, "epoch": 0.00118, "frac_reward_zero_std": 0.25, "grad_norm": 0.390070378780365, "kl": 1.9384917467832565, "learning_rate": 7.999999216298429e-06, "loss": -0.0074, "num_tokens": 1244926.0, "reward": 0.5129584670066833, "reward_std": 0.74980628490448, "rewards/rollout_reward_func/mean": 0.5129584670066833, "rewards/rollout_reward_func/std": 0.74980628490448, "sampling/importance_sampling_ratio/max": 1.6071522235870361, "sampling/importance_sampling_ratio/mean": 0.9047443866729736, "sampling/importance_sampling_ratio/min": 8.886310752131976e-06, "sampling/sampling_logp_difference/max": 3.0023913383483887, "sampling/sampling_logp_difference/mean": 0.371391236782074, "step": 59, "step_time": 13.368499253003392 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2609479650855064, "epoch": 0.0012, "grad_norm": 0.2602469027042389, "kl": 1.3569837808609009, "learning_rate": 7.999999146668991e-06, "loss": -0.01, "step": 60, "step_time": 7.403321266028797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.5217390060424805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.601012408733368, "epoch": 0.00122, "frac_reward_zero_std": 0.25, "grad_norm": 0.3636488616466522, "kl": 0.9166570156812668, "learning_rate": 7.999999074076601e-06, "loss": -0.0495, "num_tokens": 1284686.0, "reward": 0.25745058059692383, "reward_std": 0.8323983550071716, "rewards/rollout_reward_func/mean": 0.25745058059692383, "rewards/rollout_reward_func/std": 0.8323983550071716, "sampling/importance_sampling_ratio/max": 2.095696449279785, "sampling/importance_sampling_ratio/mean": 0.6459202170372009, "sampling/importance_sampling_ratio/min": 4.22397476995684e-08, "sampling/sampling_logp_difference/max": 2.4657387733459473, "sampling/sampling_logp_difference/mean": 0.5323458909988403, "step": 61, "step_time": 13.808123469993006 }, { "clip_ratio/high_max": 0.015686274971812963, "clip_ratio/high_mean": 0.007843137485906482, "clip_ratio/low_mean": 0.0032051282469183207, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011048265732824802, "entropy": 2.6023473739624023, "epoch": 0.00124, "grad_norm": 0.1417587846517563, "kl": 0.7798821032047272, "learning_rate": 7.999998998521257e-06, "loss": -0.0517, "step": 62, "step_time": 7.333340539975325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5762333273887634, "epoch": 0.00126, "frac_reward_zero_std": 0.25, "grad_norm": 0.18342533707618713, "kl": 0.5429879575967789, "learning_rate": 7.999998920002956e-06, "loss": -0.0778, "num_tokens": 1322221.0, "reward": 0.7591686248779297, "reward_std": 0.8864517211914062, "rewards/rollout_reward_func/mean": 0.7591686248779297, "rewards/rollout_reward_func/std": 0.8864517211914062, "sampling/importance_sampling_ratio/max": 2.6039817333221436, "sampling/importance_sampling_ratio/mean": 1.111398458480835, "sampling/importance_sampling_ratio/min": 2.7073170372204913e-08, "sampling/sampling_logp_difference/max": 2.0496113300323486, "sampling/sampling_logp_difference/mean": 0.37554222345352173, "step": 63, "step_time": 13.450786286004586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5923978090286255, "epoch": 0.00128, "grad_norm": 0.18269728124141693, "kl": 0.5429944768548012, "learning_rate": 7.999998838521705e-06, "loss": -0.0776, "step": 64, "step_time": 7.302083529997617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.406349867582321, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 0.2029465138912201, "kl": 0.8324829488992691, "learning_rate": 7.999998754077496e-06, "loss": -0.0814, "num_tokens": 1365013.0, "reward": 0.4067034423351288, "reward_std": 0.7602794766426086, "rewards/rollout_reward_func/mean": 0.4067034423351288, "rewards/rollout_reward_func/std": 0.7602794766426086, "sampling/importance_sampling_ratio/max": 1.561341643333435, "sampling/importance_sampling_ratio/mean": 0.7285850048065186, "sampling/importance_sampling_ratio/min": 7.717858352407347e-06, "sampling/sampling_logp_difference/max": 1.978499412536621, "sampling/sampling_logp_difference/mean": 0.45376503467559814, "step": 65, "step_time": 14.492501834989525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.412062019109726, "epoch": 0.00132, "grad_norm": 0.22637835144996643, "kl": 0.8126420080661774, "learning_rate": 7.999998666670336e-06, "loss": -0.0814, "step": 66, "step_time": 7.969802140985848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1773951947689056, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 0.19502712786197662, "kl": 0.4516945779323578, "learning_rate": 7.999998576300222e-06, "loss": -0.0654, "num_tokens": 1406196.0, "reward": 0.5655401349067688, "reward_std": 0.7595866322517395, "rewards/rollout_reward_func/mean": 0.5655401349067688, "rewards/rollout_reward_func/std": 0.7595865726470947, "sampling/importance_sampling_ratio/max": 2.4225339889526367, "sampling/importance_sampling_ratio/mean": 0.8957334160804749, "sampling/importance_sampling_ratio/min": 4.74252970228406e-10, "sampling/sampling_logp_difference/max": 2.3079490661621094, "sampling/sampling_logp_difference/mean": 0.4553307592868805, "step": 67, "step_time": 13.650183651014231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.182311475276947, "epoch": 0.00136, "grad_norm": 0.19204139709472656, "kl": 0.4527333155274391, "learning_rate": 7.999998482967154e-06, "loss": -0.0658, "step": 68, "step_time": 7.3801007350266445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.103448390960693, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7630491852760315, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 0.1628175526857376, "kl": 0.660118080675602, "learning_rate": 7.999998386671134e-06, "loss": -0.0665, "num_tokens": 1447033.0, "reward": 0.727554440498352, "reward_std": 0.8084065318107605, "rewards/rollout_reward_func/mean": 0.727554440498352, "rewards/rollout_reward_func/std": 0.8084065914154053, "sampling/importance_sampling_ratio/max": 1.7641993761062622, "sampling/importance_sampling_ratio/mean": 0.8098151087760925, "sampling/importance_sampling_ratio/min": 5.771688847744372e-06, "sampling/sampling_logp_difference/max": 1.840256690979004, "sampling/sampling_logp_difference/mean": 0.3709368109703064, "step": 69, "step_time": 13.484722186956787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011578947305679321, "clip_ratio/low_min": 0.009999999776482582, "clip_ratio/region_mean": 0.011578947305679321, "entropy": 1.7583994567394257, "epoch": 0.0014, "grad_norm": 0.13395115733146667, "kl": 0.6945981979370117, "learning_rate": 7.999998287412158e-06, "loss": -0.067, "step": 70, "step_time": 7.3999836160219274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007352941203862429, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.793103218078613, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7098856270313263, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 0.23699568212032318, "kl": 0.6070269197225571, "learning_rate": 7.99999818519023e-06, "loss": -0.0977, "num_tokens": 1489725.0, "reward": 0.5650325417518616, "reward_std": 0.7971102595329285, "rewards/rollout_reward_func/mean": 0.5650325417518616, "rewards/rollout_reward_func/std": 0.7971101999282837, "sampling/importance_sampling_ratio/max": 2.4306459426879883, "sampling/importance_sampling_ratio/mean": 0.876433253288269, "sampling/importance_sampling_ratio/min": 3.965717088050269e-09, "sampling/sampling_logp_difference/max": 2.8278090953826904, "sampling/sampling_logp_difference/mean": 0.4479917287826538, "step": 71, "step_time": 13.560595127986744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014297385700047016, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014297385700047016, "entropy": 1.7105233371257782, "epoch": 0.00144, "grad_norm": 0.2518865466117859, "kl": 0.6589539498090744, "learning_rate": 7.999998080005348e-06, "loss": -0.0984, "step": 72, "step_time": 7.408633593993727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 3.461538553237915, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.935670167207718, "epoch": 0.00146, "frac_reward_zero_std": 0.125, "grad_norm": 0.11711350828409195, "kl": 0.8901127725839615, "learning_rate": 7.999997971857512e-06, "loss": -0.065, "num_tokens": 1532890.0, "reward": 0.7951894998550415, "reward_std": 0.4443667531013489, "rewards/rollout_reward_func/mean": 0.7951894998550415, "rewards/rollout_reward_func/std": 0.4443667531013489, "sampling/importance_sampling_ratio/max": 2.0024776458740234, "sampling/importance_sampling_ratio/mean": 0.9531119465827942, "sampling/importance_sampling_ratio/min": 7.885585517897198e-08, "sampling/sampling_logp_difference/max": 2.3513379096984863, "sampling/sampling_logp_difference/mean": 0.4450494647026062, "step": 73, "step_time": 13.715363928989973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9400164783000946, "epoch": 0.00148, "grad_norm": 0.10475299507379532, "kl": 0.9156396687030792, "learning_rate": 7.999997860746726e-06, "loss": -0.0655, "step": 74, "step_time": 7.408709482959239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7354890704154968, "epoch": 0.0015, "frac_reward_zero_std": 0.25, "grad_norm": 0.29877564311027527, "kl": 3.7605235874652863, "learning_rate": 7.999997746672985e-06, "loss": -0.0662, "num_tokens": 1573523.0, "reward": 0.759925127029419, "reward_std": 0.7369545102119446, "rewards/rollout_reward_func/mean": 0.759925127029419, "rewards/rollout_reward_func/std": 0.7369545102119446, "sampling/importance_sampling_ratio/max": 2.030880928039551, "sampling/importance_sampling_ratio/mean": 0.8955699801445007, "sampling/importance_sampling_ratio/min": 2.6783709472510964e-07, "sampling/sampling_logp_difference/max": 2.8030624389648438, "sampling/sampling_logp_difference/mean": 0.44635009765625, "step": 75, "step_time": 13.808871456014458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7459544837474823, "epoch": 0.00152, "grad_norm": 0.25723323225975037, "kl": 3.3666553646326065, "learning_rate": 7.999997629636291e-06, "loss": -0.0674, "step": 76, "step_time": 7.926285246008774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 3.3214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3568529188632965, "epoch": 0.00154, "frac_reward_zero_std": 0.375, "grad_norm": 0.14872075617313385, "kl": 0.7950456589460373, "learning_rate": 7.999997509636644e-06, "loss": -0.041, "num_tokens": 1616675.0, "reward": 0.6454828381538391, "reward_std": 0.7320577502250671, "rewards/rollout_reward_func/mean": 0.6454828381538391, "rewards/rollout_reward_func/std": 0.7320576906204224, "sampling/importance_sampling_ratio/max": 1.8992130756378174, "sampling/importance_sampling_ratio/mean": 0.8234416246414185, "sampling/importance_sampling_ratio/min": 2.105470642277396e-08, "sampling/sampling_logp_difference/max": 2.314755439758301, "sampling/sampling_logp_difference/mean": 0.4737025499343872, "step": 77, "step_time": 13.44858529602061 }, { "clip_ratio/high_max": 0.030833333730697632, "clip_ratio/high_mean": 0.015416666865348816, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015416666865348816, "entropy": 1.37718865275383, "epoch": 0.00156, "grad_norm": 0.0941656306385994, "kl": 0.7162396907806396, "learning_rate": 7.999997386674047e-06, "loss": -0.0415, "step": 78, "step_time": 7.4274103340285365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.178571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4422635585069656, "epoch": 0.00158, "frac_reward_zero_std": 0.125, "grad_norm": 0.12199632823467255, "kl": 0.6229566782712936, "learning_rate": 7.999997260748495e-06, "loss": -0.0811, "num_tokens": 1657178.0, "reward": 0.5837007761001587, "reward_std": 0.648729681968689, "rewards/rollout_reward_func/mean": 0.5837007761001587, "rewards/rollout_reward_func/std": 0.6487297415733337, "sampling/importance_sampling_ratio/max": 1.681526780128479, "sampling/importance_sampling_ratio/mean": 0.8960216641426086, "sampling/importance_sampling_ratio/min": 4.8091565986396745e-05, "sampling/sampling_logp_difference/max": 3.080669641494751, "sampling/sampling_logp_difference/mean": 0.3445296883583069, "step": 79, "step_time": 13.369345822022296 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.4651570916175842, "epoch": 0.0016, "grad_norm": 0.12123571336269379, "kl": 0.6027359515428543, "learning_rate": 7.999997131859992e-06, "loss": -0.0811, "step": 80, "step_time": 7.350731488986639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5104852318763733, "epoch": 0.00162, "frac_reward_zero_std": 0.375, "grad_norm": 0.1312926709651947, "kl": 0.6267119571566582, "learning_rate": 7.999997000008536e-06, "loss": -0.0676, "num_tokens": 1697903.0, "reward": 0.683377206325531, "reward_std": 0.7098913788795471, "rewards/rollout_reward_func/mean": 0.683377206325531, "rewards/rollout_reward_func/std": 0.7098913192749023, "sampling/importance_sampling_ratio/max": 2.1206657886505127, "sampling/importance_sampling_ratio/mean": 0.9270124435424805, "sampling/importance_sampling_ratio/min": 2.145379585272167e-05, "sampling/sampling_logp_difference/max": 4.145277976989746, "sampling/sampling_logp_difference/mean": 0.3648242950439453, "step": 81, "step_time": 13.177107774012256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5274416208267212, "epoch": 0.00164, "grad_norm": 0.12942464649677277, "kl": 0.6211781874299049, "learning_rate": 7.999996865194129e-06, "loss": -0.0678, "step": 82, "step_time": 7.353099802014185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.806451320648193, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2064611613750458, "epoch": 0.00166, "frac_reward_zero_std": 0.25, "grad_norm": 0.17056311666965485, "kl": 0.4914882108569145, "learning_rate": 7.99999672741677e-06, "loss": -0.0549, "num_tokens": 1737366.0, "reward": 0.8135565519332886, "reward_std": 0.7889710068702698, "rewards/rollout_reward_func/mean": 0.8135565519332886, "rewards/rollout_reward_func/std": 0.7889710068702698, "sampling/importance_sampling_ratio/max": 1.6824474334716797, "sampling/importance_sampling_ratio/mean": 0.9140665531158447, "sampling/importance_sampling_ratio/min": 1.2870827958977316e-06, "sampling/sampling_logp_difference/max": 2.343477964401245, "sampling/sampling_logp_difference/mean": 0.300786554813385, "step": 83, "step_time": 13.140126074984437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2115330770611763, "epoch": 0.00168, "grad_norm": 0.17092265188694, "kl": 0.4938216358423233, "learning_rate": 7.999996586676458e-06, "loss": -0.0556, "step": 84, "step_time": 8.04575577302603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.551723957061768, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8141814172267914, "epoch": 0.0017, "frac_reward_zero_std": 0.125, "grad_norm": 0.18001334369182587, "kl": 1.0656866878271103, "learning_rate": 7.999996442973193e-06, "loss": -0.0695, "num_tokens": 1779169.0, "reward": 0.7263854146003723, "reward_std": 0.6007992029190063, "rewards/rollout_reward_func/mean": 0.7263854146003723, "rewards/rollout_reward_func/std": 0.6007992029190063, "sampling/importance_sampling_ratio/max": 1.873450756072998, "sampling/importance_sampling_ratio/mean": 0.8422355651855469, "sampling/importance_sampling_ratio/min": 1.0910101710948084e-09, "sampling/sampling_logp_difference/max": 2.7151405811309814, "sampling/sampling_logp_difference/mean": 0.4595716595649719, "step": 85, "step_time": 13.51167939096922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8181213736534119, "epoch": 0.00172, "grad_norm": 0.15755346417427063, "kl": 0.9771237522363663, "learning_rate": 7.99999629630698e-06, "loss": -0.0699, "step": 86, "step_time": 7.78755507801543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0259642004966736, "epoch": 0.00174, "frac_reward_zero_std": 0.125, "grad_norm": 0.09383644908666611, "kl": 0.8567705824971199, "learning_rate": 7.999996146677813e-06, "loss": -0.0677, "num_tokens": 1821623.0, "reward": 0.7134997844696045, "reward_std": 0.6498125791549683, "rewards/rollout_reward_func/mean": 0.7134997844696045, "rewards/rollout_reward_func/std": 0.6498125791549683, "sampling/importance_sampling_ratio/max": 2.0679726600646973, "sampling/importance_sampling_ratio/mean": 0.7940202951431274, "sampling/importance_sampling_ratio/min": 1.447487676387027e-07, "sampling/sampling_logp_difference/max": 2.374112844467163, "sampling/sampling_logp_difference/mean": 0.43251538276672363, "step": 87, "step_time": 13.702273792994674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0261653661727905, "epoch": 0.00176, "grad_norm": 0.08854828029870987, "kl": 0.8215272054076195, "learning_rate": 7.999995994085696e-06, "loss": -0.068, "step": 88, "step_time": 7.387128681963077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.737884372472763, "epoch": 0.00178, "frac_reward_zero_std": 0.125, "grad_norm": 0.14514029026031494, "kl": 1.3648712635040283, "learning_rate": 7.999995838530628e-06, "loss": -0.0695, "num_tokens": 1862463.0, "reward": 0.9136732220649719, "reward_std": 0.6820932626724243, "rewards/rollout_reward_func/mean": 0.9136732220649719, "rewards/rollout_reward_func/std": 0.6820932030677795, "sampling/importance_sampling_ratio/max": 1.60533607006073, "sampling/importance_sampling_ratio/mean": 0.8111950159072876, "sampling/importance_sampling_ratio/min": 3.951832994175675e-08, "sampling/sampling_logp_difference/max": 3.046593427658081, "sampling/sampling_logp_difference/mean": 0.3985365033149719, "step": 89, "step_time": 13.512448907014914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7426247298717499, "epoch": 0.0018, "grad_norm": 0.13489937782287598, "kl": 1.23275226354599, "learning_rate": 7.99999568001261e-06, "loss": -0.0704, "step": 90, "step_time": 7.380271461966913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.125, "completions/mean_terminated_length": 3.7419352531433105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9468292221426964, "epoch": 0.00182, "frac_reward_zero_std": 0.125, "grad_norm": 0.16084791719913483, "kl": 0.5955685675144196, "learning_rate": 7.999995518531638e-06, "loss": -0.0526, "num_tokens": 1903349.0, "reward": 0.7176684141159058, "reward_std": 0.6334609985351562, "rewards/rollout_reward_func/mean": 0.7176684141159058, "rewards/rollout_reward_func/std": 0.6334609985351562, "sampling/importance_sampling_ratio/max": 1.5449637174606323, "sampling/importance_sampling_ratio/mean": 0.9340054392814636, "sampling/importance_sampling_ratio/min": 0.000326521199895069, "sampling/sampling_logp_difference/max": 2.370257616043091, "sampling/sampling_logp_difference/mean": 0.2433616667985916, "step": 91, "step_time": 12.989624284004094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9479116648435593, "epoch": 0.00184, "grad_norm": 0.16185013949871063, "kl": 0.5817837864160538, "learning_rate": 7.999995354087718e-06, "loss": -0.0532, "step": 92, "step_time": 7.284792143997038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 3.700000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3723521530628204, "epoch": 0.00186, "frac_reward_zero_std": 0.25, "grad_norm": 0.10205340385437012, "kl": 1.0326064676046371, "learning_rate": 7.999995186680847e-06, "loss": -0.0271, "num_tokens": 1945848.0, "reward": 0.653235673904419, "reward_std": 0.7152080535888672, "rewards/rollout_reward_func/mean": 0.653235673904419, "rewards/rollout_reward_func/std": 0.7152079939842224, "sampling/importance_sampling_ratio/max": 1.789787769317627, "sampling/importance_sampling_ratio/mean": 0.846100926399231, "sampling/importance_sampling_ratio/min": 4.9853056793836004e-08, "sampling/sampling_logp_difference/max": 1.9960665702819824, "sampling/sampling_logp_difference/mean": 0.3801291584968567, "step": 93, "step_time": 13.386654714966426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3749693185091019, "epoch": 0.00188, "grad_norm": 0.10649532079696655, "kl": 1.042083516716957, "learning_rate": 7.999995016311026e-06, "loss": -0.0275, "step": 94, "step_time": 8.099126882996643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2045509815216064, "epoch": 0.0019, "frac_reward_zero_std": 0.125, "grad_norm": 0.10183614492416382, "kl": 1.1438529789447784, "learning_rate": 7.999994842978255e-06, "loss": -0.052, "num_tokens": 1986031.0, "reward": 0.424629271030426, "reward_std": 0.8473767042160034, "rewards/rollout_reward_func/mean": 0.424629271030426, "rewards/rollout_reward_func/std": 0.8473766446113586, "sampling/importance_sampling_ratio/max": 1.4995328187942505, "sampling/importance_sampling_ratio/mean": 0.6132031083106995, "sampling/importance_sampling_ratio/min": 1.1587589776596019e-09, "sampling/sampling_logp_difference/max": 2.571249485015869, "sampling/sampling_logp_difference/mean": 0.507567822933197, "step": 95, "step_time": 14.371954682981595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.202300012111664, "epoch": 0.00192, "grad_norm": 0.10720271617174149, "kl": 1.1031748950481415, "learning_rate": 7.999994666682534e-06, "loss": -0.052, "step": 96, "step_time": 7.402519005001523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.0625, "completions/mean_terminated_length": 3.6774191856384277, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.139912523329258, "epoch": 0.00194, "frac_reward_zero_std": 0.625, "grad_norm": 0.08658095449209213, "kl": 0.33942941576242447, "learning_rate": 7.999994487423863e-06, "loss": -0.0224, "num_tokens": 2028423.0, "reward": 0.8417947292327881, "reward_std": 0.5534140467643738, "rewards/rollout_reward_func/mean": 0.8417947292327881, "rewards/rollout_reward_func/std": 0.5534140467643738, "sampling/importance_sampling_ratio/max": 1.8327239751815796, "sampling/importance_sampling_ratio/mean": 1.039649248123169, "sampling/importance_sampling_ratio/min": 2.0992692952859215e-06, "sampling/sampling_logp_difference/max": 1.8912296295166016, "sampling/sampling_logp_difference/mean": 0.26138585805892944, "step": 97, "step_time": 13.353252779954346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1415030658245087, "epoch": 0.00196, "grad_norm": 0.08631787449121475, "kl": 0.33610787615180016, "learning_rate": 7.999994305202242e-06, "loss": -0.0225, "step": 98, "step_time": 7.38732369398349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4061581790447235, "epoch": 0.00198, "frac_reward_zero_std": 0.25, "grad_norm": 0.14817950129508972, "kl": 1.0418185889720917, "learning_rate": 7.999994120017672e-06, "loss": -0.0431, "num_tokens": 2069675.0, "reward": 0.5654841661453247, "reward_std": 0.6796900033950806, "rewards/rollout_reward_func/mean": 0.5654841661453247, "rewards/rollout_reward_func/std": 0.6796900033950806, "sampling/importance_sampling_ratio/max": 1.4593231678009033, "sampling/importance_sampling_ratio/mean": 0.8215492367744446, "sampling/importance_sampling_ratio/min": 1.4876872000968433e-07, "sampling/sampling_logp_difference/max": 2.7828383445739746, "sampling/sampling_logp_difference/mean": 0.37129032611846924, "step": 99, "step_time": 13.42800251598237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4014839455485344, "epoch": 0.002, "grad_norm": 0.14855359494686127, "kl": 1.0043862760066986, "learning_rate": 7.999993931870152e-06, "loss": -0.0431, "step": 100, "step_time": 7.333227452967549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.3125, "completions/mean_terminated_length": 3.935483694076538, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4609520137310028, "epoch": 0.00202, "frac_reward_zero_std": 0.5, "grad_norm": 0.11193384975194931, "kl": 0.4065626412630081, "learning_rate": 7.999993740759685e-06, "loss": -0.0401, "num_tokens": 2111491.0, "reward": 0.7881255149841309, "reward_std": 0.6281613707542419, "rewards/rollout_reward_func/mean": 0.7881255149841309, "rewards/rollout_reward_func/std": 0.6281613111495972, "sampling/importance_sampling_ratio/max": 1.5830180644989014, "sampling/importance_sampling_ratio/mean": 1.0739452838897705, "sampling/importance_sampling_ratio/min": 1.670182712132373e-07, "sampling/sampling_logp_difference/max": 1.9727630615234375, "sampling/sampling_logp_difference/mean": 0.3505682945251465, "step": 101, "step_time": 13.29214097201475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.456451192498207, "epoch": 0.00204, "grad_norm": 0.11461129784584045, "kl": 0.40108664333820343, "learning_rate": 7.999993546686268e-06, "loss": -0.04, "step": 102, "step_time": 7.386168880999321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6033264696598053, "epoch": 0.00206, "frac_reward_zero_std": 0.25, "grad_norm": 0.09053642302751541, "kl": 0.7574311792850494, "learning_rate": 7.999993349649902e-06, "loss": -0.0313, "num_tokens": 2153761.0, "reward": 0.5206936597824097, "reward_std": 0.7109124064445496, "rewards/rollout_reward_func/mean": 0.5206936597824097, "rewards/rollout_reward_func/std": 0.7109124064445496, "sampling/importance_sampling_ratio/max": 1.764373779296875, "sampling/importance_sampling_ratio/mean": 0.8510134816169739, "sampling/importance_sampling_ratio/min": 2.9842566195981135e-09, "sampling/sampling_logp_difference/max": 1.982487678527832, "sampling/sampling_logp_difference/mean": 0.41302794218063354, "step": 103, "step_time": 13.840266868995968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5997430235147476, "epoch": 0.00208, "grad_norm": 0.09049440920352936, "kl": 0.7414917200803757, "learning_rate": 7.999993149650587e-06, "loss": -0.0315, "step": 104, "step_time": 7.403468825970776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.71875, "completions/mean_terminated_length": 3.9666669368743896, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5171255320310593, "epoch": 0.0021, "frac_reward_zero_std": 0.375, "grad_norm": 0.07308118045330048, "kl": 0.5007425397634506, "learning_rate": 7.999992946688324e-06, "loss": -0.0559, "num_tokens": 2195716.0, "reward": 0.8160977959632874, "reward_std": 0.5393236875534058, "rewards/rollout_reward_func/mean": 0.8160977959632874, "rewards/rollout_reward_func/std": 0.5393237471580505, "sampling/importance_sampling_ratio/max": 2.061547040939331, "sampling/importance_sampling_ratio/mean": 1.0483710765838623, "sampling/importance_sampling_ratio/min": 1.1222250577702653e-06, "sampling/sampling_logp_difference/max": 2.064573049545288, "sampling/sampling_logp_difference/mean": 0.33967095613479614, "step": 105, "step_time": 14.18994961300632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.509080447256565, "epoch": 0.00212, "grad_norm": 0.07318316400051117, "kl": 0.5067119896411896, "learning_rate": 7.999992740763114e-06, "loss": -0.0562, "step": 106, "step_time": 7.367917214985937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 5.111111164093018, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.350807934999466, "epoch": 0.00214, "frac_reward_zero_std": 0.0, "grad_norm": 0.14618659019470215, "kl": 0.7124654725193977, "learning_rate": 7.999992531874955e-06, "loss": -0.0781, "num_tokens": 2234629.0, "reward": 0.4058985710144043, "reward_std": 0.8973264098167419, "rewards/rollout_reward_func/mean": 0.4058985710144043, "rewards/rollout_reward_func/std": 0.8973264098167419, "sampling/importance_sampling_ratio/max": 1.8355087041854858, "sampling/importance_sampling_ratio/mean": 0.7537065744400024, "sampling/importance_sampling_ratio/min": 2.721579184239431e-09, "sampling/sampling_logp_difference/max": 2.5141849517822266, "sampling/sampling_logp_difference/mean": 0.4745604693889618, "step": 107, "step_time": 13.509204244997818 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.3402808606624603, "epoch": 0.00216, "grad_norm": 0.1349266916513443, "kl": 0.6856067776679993, "learning_rate": 7.99999232002385e-06, "loss": -0.0787, "step": 108, "step_time": 7.340252080990467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1012399196624756, "epoch": 0.00218, "frac_reward_zero_std": 0.25, "grad_norm": 0.12561196088790894, "kl": 0.39643602073192596, "learning_rate": 7.999992105209796e-06, "loss": -0.0568, "num_tokens": 2276659.0, "reward": 0.6414012908935547, "reward_std": 0.7119438648223877, "rewards/rollout_reward_func/mean": 0.6414012908935547, "rewards/rollout_reward_func/std": 0.7119438052177429, "sampling/importance_sampling_ratio/max": 1.4566899538040161, "sampling/importance_sampling_ratio/mean": 0.8980593085289001, "sampling/importance_sampling_ratio/min": 7.115548896763357e-07, "sampling/sampling_logp_difference/max": 2.241684913635254, "sampling/sampling_logp_difference/mean": 0.4511716365814209, "step": 109, "step_time": 13.67465943400748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.0982561707496643, "epoch": 0.0022, "grad_norm": 0.12452337890863419, "kl": 0.3838076665997505, "learning_rate": 7.999991887432795e-06, "loss": -0.0568, "step": 110, "step_time": 7.391446803958388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8518780171871185, "epoch": 0.00222, "frac_reward_zero_std": 0.125, "grad_norm": 0.14721091091632843, "kl": 0.43586038053035736, "learning_rate": 7.999991666692848e-06, "loss": -0.0552, "num_tokens": 2319251.0, "reward": 0.6414339542388916, "reward_std": 0.7178946137428284, "rewards/rollout_reward_func/mean": 0.6414339542388916, "rewards/rollout_reward_func/std": 0.7178946137428284, "sampling/importance_sampling_ratio/max": 2.123800039291382, "sampling/importance_sampling_ratio/mean": 0.9124401807785034, "sampling/importance_sampling_ratio/min": 1.5324381763548445e-09, "sampling/sampling_logp_difference/max": 2.524271249771118, "sampling/sampling_logp_difference/mean": 0.467507928609848, "step": 111, "step_time": 13.676378069038037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8412518501281738, "epoch": 0.00224, "grad_norm": 0.1430835872888565, "kl": 0.4364122897386551, "learning_rate": 7.999991442989953e-06, "loss": -0.0556, "step": 112, "step_time": 7.462103360972833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5657634660601616, "epoch": 0.00226, "frac_reward_zero_std": 0.25, "grad_norm": 0.2353268414735794, "kl": 0.6036189198493958, "learning_rate": 7.999991216324112e-06, "loss": -0.0465, "num_tokens": 2357303.0, "reward": 0.9132423400878906, "reward_std": 0.6479337215423584, "rewards/rollout_reward_func/mean": 0.9132423400878906, "rewards/rollout_reward_func/std": 0.6479337215423584, "sampling/importance_sampling_ratio/max": 1.5804269313812256, "sampling/importance_sampling_ratio/mean": 0.7604790925979614, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7836416959762573, "sampling/sampling_logp_difference/mean": 0.3856145739555359, "step": 113, "step_time": 14.137382863991661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.5561434924602509, "epoch": 0.00228, "grad_norm": 0.0884590819478035, "kl": 0.637834221124649, "learning_rate": 7.999990986695325e-06, "loss": -0.0472, "step": 114, "step_time": 7.759363209042931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 3.8518519401550293, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5791166722774506, "epoch": 0.0023, "frac_reward_zero_std": 0.125, "grad_norm": 0.243779718875885, "kl": 0.5725055113434792, "learning_rate": 7.999990754103591e-06, "loss": -0.016, "num_tokens": 2399621.0, "reward": 0.3672381639480591, "reward_std": 0.7664651870727539, "rewards/rollout_reward_func/mean": 0.3672381639480591, "rewards/rollout_reward_func/std": 0.7664651870727539, "sampling/importance_sampling_ratio/max": 2.2509703636169434, "sampling/importance_sampling_ratio/mean": 0.9078182578086853, "sampling/importance_sampling_ratio/min": 3.6465483077563476e-08, "sampling/sampling_logp_difference/max": 2.822394371032715, "sampling/sampling_logp_difference/mean": 0.43570443987846375, "step": 115, "step_time": 13.562330630957149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5754790604114532, "epoch": 0.00232, "grad_norm": 0.2594475746154785, "kl": 0.5907891392707825, "learning_rate": 7.99999051854891e-06, "loss": -0.0177, "step": 116, "step_time": 7.402268615027424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 3.965517282485962, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3532088100910187, "epoch": 0.00234, "frac_reward_zero_std": 0.25, "grad_norm": 0.09621760249137878, "kl": 0.7460349947214127, "learning_rate": 7.999990280031285e-06, "loss": -0.0765, "num_tokens": 2438416.0, "reward": 0.6759672164916992, "reward_std": 0.6970655918121338, "rewards/rollout_reward_func/mean": 0.6759672164916992, "rewards/rollout_reward_func/std": 0.6970655918121338, "sampling/importance_sampling_ratio/max": 1.6706056594848633, "sampling/importance_sampling_ratio/mean": 0.8806052207946777, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.5007681846618652, "sampling/sampling_logp_difference/mean": 0.44310176372528076, "step": 117, "step_time": 13.130796202021884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3436716347932816, "epoch": 0.00236, "grad_norm": 0.097286656498909, "kl": 0.7125769630074501, "learning_rate": 7.999990038550715e-06, "loss": -0.0767, "step": 118, "step_time": 7.278747385047609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 3.5714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6698466688394547, "epoch": 0.00238, "frac_reward_zero_std": 0.375, "grad_norm": 0.17976994812488556, "kl": 0.9608743041753769, "learning_rate": 7.9999897941072e-06, "loss": -0.0658, "num_tokens": 2480754.0, "reward": 0.6768029928207397, "reward_std": 0.6971518397331238, "rewards/rollout_reward_func/mean": 0.6768029928207397, "rewards/rollout_reward_func/std": 0.697151780128479, "sampling/importance_sampling_ratio/max": 2.291323184967041, "sampling/importance_sampling_ratio/mean": 0.8315238952636719, "sampling/importance_sampling_ratio/min": 5.801308045061404e-11, "sampling/sampling_logp_difference/max": 2.2250609397888184, "sampling/sampling_logp_difference/mean": 0.531991183757782, "step": 119, "step_time": 13.370235940004932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6660777628421783, "epoch": 0.0024, "grad_norm": 0.18669870495796204, "kl": 0.9914654716849327, "learning_rate": 7.999989546700739e-06, "loss": -0.0663, "step": 120, "step_time": 7.391478910954902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.96875, "completions/mean_terminated_length": 4.233333587646484, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2270509079098701, "epoch": 0.00242, "frac_reward_zero_std": 0.25, "grad_norm": 0.1232658326625824, "kl": 0.42613787204027176, "learning_rate": 7.999989296331334e-06, "loss": -0.062, "num_tokens": 2523601.0, "reward": 0.7338577508926392, "reward_std": 0.6182498931884766, "rewards/rollout_reward_func/mean": 0.7338577508926392, "rewards/rollout_reward_func/std": 0.6182499527931213, "sampling/importance_sampling_ratio/max": 1.8281406164169312, "sampling/importance_sampling_ratio/mean": 0.8923301100730896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7545595169067383, "sampling/sampling_logp_difference/mean": 0.2948044240474701, "step": 121, "step_time": 13.547385791956913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02512290421873331, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02512290421873331, "entropy": 1.2247934341430664, "epoch": 0.00244, "grad_norm": 0.12976936995983124, "kl": 0.46046049892902374, "learning_rate": 7.999989042998983e-06, "loss": -0.0623, "step": 122, "step_time": 8.020124083996052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 3.8620688915252686, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4331515952944756, "epoch": 0.00246, "frac_reward_zero_std": 0.25, "grad_norm": 0.12389996647834778, "kl": 0.5737656056880951, "learning_rate": 7.99998878670369e-06, "loss": -0.074, "num_tokens": 2565435.0, "reward": 0.8166020512580872, "reward_std": 0.6040683388710022, "rewards/rollout_reward_func/mean": 0.8166020512580872, "rewards/rollout_reward_func/std": 0.6040682792663574, "sampling/importance_sampling_ratio/max": 2.934326648712158, "sampling/importance_sampling_ratio/mean": 0.995736837387085, "sampling/importance_sampling_ratio/min": 2.049709109996911e-06, "sampling/sampling_logp_difference/max": 2.7275702953338623, "sampling/sampling_logp_difference/mean": 0.3495370149612427, "step": 123, "step_time": 13.366294720995938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01226076576858759, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01226076576858759, "entropy": 1.4211450070142746, "epoch": 0.00248, "grad_norm": 0.1207331195473671, "kl": 0.6440499611198902, "learning_rate": 7.999988527445453e-06, "loss": -0.0743, "step": 124, "step_time": 7.964402243960649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.043478488922119, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5995678305625916, "epoch": 0.0025, "frac_reward_zero_std": 0.125, "grad_norm": 0.15041887760162354, "kl": 0.9455753639340401, "learning_rate": 7.99998826522427e-06, "loss": -0.0563, "num_tokens": 2604916.0, "reward": 0.39660653471946716, "reward_std": 0.8684905171394348, "rewards/rollout_reward_func/mean": 0.39660653471946716, "rewards/rollout_reward_func/std": 0.8684905171394348, "sampling/importance_sampling_ratio/max": 1.5988229513168335, "sampling/importance_sampling_ratio/mean": 0.6902393102645874, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.471303939819336, "sampling/sampling_logp_difference/mean": 0.6626202464103699, "step": 125, "step_time": 13.680438776005758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.600655972957611, "epoch": 0.00252, "grad_norm": 0.15486709773540497, "kl": 1.0422997325658798, "learning_rate": 7.999988000040144e-06, "loss": -0.0561, "step": 126, "step_time": 7.3007075730129145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 3.359999895095825, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.637642353773117, "epoch": 0.00254, "frac_reward_zero_std": 0.125, "grad_norm": 0.15007339417934418, "kl": 1.0965794622898102, "learning_rate": 7.999987731893076e-06, "loss": -0.054, "num_tokens": 2647764.0, "reward": 0.5223478078842163, "reward_std": 0.7955189347267151, "rewards/rollout_reward_func/mean": 0.5223478078842163, "rewards/rollout_reward_func/std": 0.7955189347267151, "sampling/importance_sampling_ratio/max": 1.4759010076522827, "sampling/importance_sampling_ratio/mean": 0.8478649854660034, "sampling/importance_sampling_ratio/min": 2.024634453146046e-11, "sampling/sampling_logp_difference/max": 2.5649099349975586, "sampling/sampling_logp_difference/mean": 0.5830302238464355, "step": 127, "step_time": 13.79945693598711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6404869854450226, "epoch": 0.00256, "grad_norm": 0.13915136456489563, "kl": 1.050971195101738, "learning_rate": 7.999987460783066e-06, "loss": -0.0544, "step": 128, "step_time": 7.384202719986206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 3.7857143878936768, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7830494791269302, "epoch": 0.00258, "frac_reward_zero_std": 0.25, "grad_norm": 0.5898280143737793, "kl": 1.8552266508340836, "learning_rate": 7.999987186710111e-06, "loss": -0.0542, "num_tokens": 2690249.0, "reward": 0.32545381784439087, "reward_std": 0.8310579657554626, "rewards/rollout_reward_func/mean": 0.32545381784439087, "rewards/rollout_reward_func/std": 0.8310579657554626, "sampling/importance_sampling_ratio/max": 1.6024534702301025, "sampling/importance_sampling_ratio/mean": 0.8474000692367554, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.681257963180542, "sampling/sampling_logp_difference/mean": 0.4551326632499695, "step": 129, "step_time": 13.514088444993831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7797339260578156, "epoch": 0.0026, "grad_norm": 0.1431848704814911, "kl": 0.7712808847427368, "learning_rate": 7.999986909674215e-06, "loss": -0.0567, "step": 130, "step_time": 7.390377140982309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 3.4827585220336914, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1882009506225586, "epoch": 0.00262, "frac_reward_zero_std": 0.25, "grad_norm": 0.15668253600597382, "kl": 0.5982630550861359, "learning_rate": 7.999986629675377e-06, "loss": -0.0465, "num_tokens": 2732943.0, "reward": 0.6070648431777954, "reward_std": 0.6204836964607239, "rewards/rollout_reward_func/mean": 0.6070648431777954, "rewards/rollout_reward_func/std": 0.6204836964607239, "sampling/importance_sampling_ratio/max": 1.840167760848999, "sampling/importance_sampling_ratio/mean": 0.9704622030258179, "sampling/importance_sampling_ratio/min": 1.0926180038950406e-05, "sampling/sampling_logp_difference/max": 1.7847546339035034, "sampling/sampling_logp_difference/mean": 0.2816389799118042, "step": 131, "step_time": 13.541249369969591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1866857707500458, "epoch": 0.00264, "grad_norm": 0.15712019801139832, "kl": 0.5760191082954407, "learning_rate": 7.999986346713597e-06, "loss": -0.0474, "step": 132, "step_time": 7.8868556049710605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.700347751379013, "epoch": 0.00266, "frac_reward_zero_std": 0.25, "grad_norm": 0.11548580974340439, "kl": 0.5758876651525497, "learning_rate": 7.999986060788874e-06, "loss": -0.0275, "num_tokens": 2774073.0, "reward": 0.7380505800247192, "reward_std": 0.6604363918304443, "rewards/rollout_reward_func/mean": 0.7380505800247192, "rewards/rollout_reward_func/std": 0.6604364514350891, "sampling/importance_sampling_ratio/max": 1.5590382814407349, "sampling/importance_sampling_ratio/mean": 0.860594630241394, "sampling/importance_sampling_ratio/min": 2.0171512005617842e-05, "sampling/sampling_logp_difference/max": 2.153215169906616, "sampling/sampling_logp_difference/mean": 0.38668209314346313, "step": 133, "step_time": 13.253368017991306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7089445888996124, "epoch": 0.00268, "grad_norm": 0.11654999107122421, "kl": 0.5405199378728867, "learning_rate": 7.999985771901212e-06, "loss": -0.0281, "step": 134, "step_time": 7.762957149010617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.349157489836216, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 0.13211940228939056, "kl": 0.7673752978444099, "learning_rate": 7.999985480050609e-06, "loss": -0.0695, "num_tokens": 2813466.0, "reward": 0.5724626183509827, "reward_std": 0.8392655849456787, "rewards/rollout_reward_func/mean": 0.5724626183509827, "rewards/rollout_reward_func/std": 0.8392655253410339, "sampling/importance_sampling_ratio/max": 1.6873210668563843, "sampling/importance_sampling_ratio/mean": 0.8715179562568665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.385127544403076, "sampling/sampling_logp_difference/mean": 0.3196907937526703, "step": 135, "step_time": 13.035386831004871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.350214384496212, "epoch": 0.00272, "grad_norm": 0.10614006966352463, "kl": 0.6813817843794823, "learning_rate": 7.999985185237063e-06, "loss": -0.0695, "step": 136, "step_time": 7.1527684350148775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9065848290920258, "epoch": 0.00274, "frac_reward_zero_std": 0.0, "grad_norm": 0.127616286277771, "kl": 0.5647574290633202, "learning_rate": 7.999984887460579e-06, "loss": -0.073, "num_tokens": 2854308.0, "reward": 0.6534372568130493, "reward_std": 0.778069019317627, "rewards/rollout_reward_func/mean": 0.6534372568130493, "rewards/rollout_reward_func/std": 0.7780690789222717, "sampling/importance_sampling_ratio/max": 1.693185806274414, "sampling/importance_sampling_ratio/mean": 0.7484979629516602, "sampling/importance_sampling_ratio/min": 1.0952532647934277e-05, "sampling/sampling_logp_difference/max": 2.4697365760803223, "sampling/sampling_logp_difference/mean": 0.4136084318161011, "step": 137, "step_time": 13.508384881017264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8996959030628204, "epoch": 0.00276, "grad_norm": 0.12982730567455292, "kl": 0.5468348599970341, "learning_rate": 7.999984586721153e-06, "loss": -0.0733, "step": 138, "step_time": 7.365648780978518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.308943808078766, "epoch": 0.00278, "frac_reward_zero_std": 0.25, "grad_norm": 0.1335449516773224, "kl": 0.4674496129155159, "learning_rate": 7.999984283018788e-06, "loss": -0.0532, "num_tokens": 2894626.0, "reward": 0.6393507719039917, "reward_std": 0.846705973148346, "rewards/rollout_reward_func/mean": 0.6393507719039917, "rewards/rollout_reward_func/std": 0.8467059135437012, "sampling/importance_sampling_ratio/max": 1.5210384130477905, "sampling/importance_sampling_ratio/mean": 0.7399086356163025, "sampling/importance_sampling_ratio/min": 8.25998469622391e-09, "sampling/sampling_logp_difference/max": 2.5863571166992188, "sampling/sampling_logp_difference/mean": 0.5010234117507935, "step": 139, "step_time": 13.480961201974424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3162354230880737, "epoch": 0.0028, "grad_norm": 0.13288763165473938, "kl": 0.4530162326991558, "learning_rate": 7.999983976353484e-06, "loss": -0.0533, "step": 140, "step_time": 7.37070978799602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 3.555555582046509, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8455000519752502, "epoch": 0.00282, "frac_reward_zero_std": 0.25, "grad_norm": 0.24121099710464478, "kl": 0.530284196138382, "learning_rate": 7.99998366672524e-06, "loss": -0.0617, "num_tokens": 2936657.0, "reward": 0.6294205188751221, "reward_std": 0.7927162051200867, "rewards/rollout_reward_func/mean": 0.6294205188751221, "rewards/rollout_reward_func/std": 0.7927162051200867, "sampling/importance_sampling_ratio/max": 1.6491268873214722, "sampling/importance_sampling_ratio/mean": 0.8814723491668701, "sampling/importance_sampling_ratio/min": 4.279440801724377e-09, "sampling/sampling_logp_difference/max": 2.7799735069274902, "sampling/sampling_logp_difference/mean": 0.40204256772994995, "step": 141, "step_time": 13.52643806595006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.856543093919754, "epoch": 0.00284, "grad_norm": 0.2337615042924881, "kl": 0.5292752422392368, "learning_rate": 7.999983354134058e-06, "loss": -0.0624, "step": 142, "step_time": 7.797203859983711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.034482955932617, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8495520055294037, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 0.2036367952823639, "kl": 0.6647381708025932, "learning_rate": 7.999983038579937e-06, "loss": -0.1066, "num_tokens": 2977184.0, "reward": 0.3285282254219055, "reward_std": 0.8000537157058716, "rewards/rollout_reward_func/mean": 0.3285282254219055, "rewards/rollout_reward_func/std": 0.8000537157058716, "sampling/importance_sampling_ratio/max": 2.5709586143493652, "sampling/importance_sampling_ratio/mean": 0.8426539897918701, "sampling/importance_sampling_ratio/min": 1.0506481885386165e-05, "sampling/sampling_logp_difference/max": 1.7975220680236816, "sampling/sampling_logp_difference/mean": 0.3646942377090454, "step": 143, "step_time": 13.533982652967097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.845811277627945, "epoch": 0.00288, "grad_norm": 0.19617998600006104, "kl": 0.671777568757534, "learning_rate": 7.999982720062878e-06, "loss": -0.1075, "step": 144, "step_time": 7.8253811780014075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5612438917160034, "epoch": 0.0029, "frac_reward_zero_std": 0.125, "grad_norm": 0.08279308676719666, "kl": 0.530586376786232, "learning_rate": 7.99998239858288e-06, "loss": -0.0455, "num_tokens": 3017635.0, "reward": 0.8473905324935913, "reward_std": 0.6664355993270874, "rewards/rollout_reward_func/mean": 0.8473905324935913, "rewards/rollout_reward_func/std": 0.6664355993270874, "sampling/importance_sampling_ratio/max": 1.4107486009597778, "sampling/importance_sampling_ratio/mean": 0.7874868512153625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.0444841384887695, "sampling/sampling_logp_difference/mean": 0.33104634284973145, "step": 145, "step_time": 13.571312008978566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.5592286586761475, "epoch": 0.00292, "grad_norm": 0.09206055849790573, "kl": 0.5728097558021545, "learning_rate": 7.999982074139944e-06, "loss": -0.0459, "step": 146, "step_time": 7.351400557003217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 3.9259259700775146, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0416519939899445, "epoch": 0.00294, "frac_reward_zero_std": 0.25, "grad_norm": 0.10206233710050583, "kl": 0.3854028061032295, "learning_rate": 7.999981746734073e-06, "loss": -0.0586, "num_tokens": 3057180.0, "reward": 0.7533062100410461, "reward_std": 0.7488416433334351, "rewards/rollout_reward_func/mean": 0.7533062100410461, "rewards/rollout_reward_func/std": 0.7488415241241455, "sampling/importance_sampling_ratio/max": 1.5733379125595093, "sampling/importance_sampling_ratio/mean": 0.8150993585586548, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8562707901000977, "sampling/sampling_logp_difference/mean": 0.4780171811580658, "step": 147, "step_time": 13.28569485002663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.025366961956024, "epoch": 0.00296, "grad_norm": 0.10355637222528458, "kl": 0.4076516404747963, "learning_rate": 7.999981416365263e-06, "loss": -0.059, "step": 148, "step_time": 7.34604433196364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 5.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0063598081469536, "epoch": 0.00298, "frac_reward_zero_std": 0.125, "grad_norm": 0.2445768266916275, "kl": 1.0303535014390945, "learning_rate": 7.999981083033518e-06, "loss": -0.055, "num_tokens": 3097073.0, "reward": 0.5754845142364502, "reward_std": 0.7734224200248718, "rewards/rollout_reward_func/mean": 0.5754845142364502, "rewards/rollout_reward_func/std": 0.7734223008155823, "sampling/importance_sampling_ratio/max": 2.0410220623016357, "sampling/importance_sampling_ratio/mean": 0.8661245107650757, "sampling/importance_sampling_ratio/min": 8.094350050669163e-06, "sampling/sampling_logp_difference/max": 3.7695975303649902, "sampling/sampling_logp_difference/mean": 0.30924391746520996, "step": 149, "step_time": 13.265723329008324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 0.9986613839864731, "epoch": 0.003, "grad_norm": 0.24535596370697021, "kl": 1.1001570969820023, "learning_rate": 7.999980746738835e-06, "loss": -0.0557, "step": 150, "step_time": 7.333964282006491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 3.700000286102295, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0624167919158936, "epoch": 0.00302, "frac_reward_zero_std": 0.375, "grad_norm": 0.1255745142698288, "kl": 1.7142913937568665, "learning_rate": 7.999980407481217e-06, "loss": -0.0568, "num_tokens": 3139134.0, "reward": 0.5353561639785767, "reward_std": 0.7784624099731445, "rewards/rollout_reward_func/mean": 0.5353561639785767, "rewards/rollout_reward_func/std": 0.7784624099731445, "sampling/importance_sampling_ratio/max": 2.199014902114868, "sampling/importance_sampling_ratio/mean": 1.0242780447006226, "sampling/importance_sampling_ratio/min": 1.4789466149522923e-05, "sampling/sampling_logp_difference/max": 2.139009475708008, "sampling/sampling_logp_difference/mean": 0.2944427728652954, "step": 151, "step_time": 13.542915845988318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0545874908566475, "epoch": 0.00304, "grad_norm": 0.13164789974689484, "kl": 1.8217458575963974, "learning_rate": 7.999980065260663e-06, "loss": -0.0568, "step": 152, "step_time": 7.381949166971026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 3.379310369491577, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.16631481051445, "epoch": 0.00306, "frac_reward_zero_std": 0.375, "grad_norm": 0.13061584532260895, "kl": 0.5816949233412743, "learning_rate": 7.999979720077173e-06, "loss": -0.067, "num_tokens": 3180238.0, "reward": 0.7907090187072754, "reward_std": 0.7369586229324341, "rewards/rollout_reward_func/mean": 0.7907090187072754, "rewards/rollout_reward_func/std": 0.7369586229324341, "sampling/importance_sampling_ratio/max": 2.1261603832244873, "sampling/importance_sampling_ratio/mean": 1.0047863721847534, "sampling/importance_sampling_ratio/min": 5.50809318156098e-08, "sampling/sampling_logp_difference/max": 2.086775302886963, "sampling/sampling_logp_difference/mean": 0.31384116411209106, "step": 153, "step_time": 13.67310222503147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1575913056731224, "epoch": 0.00308, "grad_norm": 0.12478004395961761, "kl": 0.6438077762722969, "learning_rate": 7.99997937193075e-06, "loss": -0.0672, "step": 154, "step_time": 7.3583280890015885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.068965435028076, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.641471303999424, "epoch": 0.0031, "frac_reward_zero_std": 0.5, "grad_norm": 0.12696267664432526, "kl": 0.35288169234991074, "learning_rate": 7.99997902082139e-06, "loss": -0.0285, "num_tokens": 3220203.0, "reward": 0.8656240701675415, "reward_std": 0.6919684410095215, "rewards/rollout_reward_func/mean": 0.8656240701675415, "rewards/rollout_reward_func/std": 0.6919683814048767, "sampling/importance_sampling_ratio/max": 1.6436264514923096, "sampling/importance_sampling_ratio/mean": 0.9866600036621094, "sampling/importance_sampling_ratio/min": 6.35815879945767e-08, "sampling/sampling_logp_difference/max": 2.0103492736816406, "sampling/sampling_logp_difference/mean": 0.3799123764038086, "step": 155, "step_time": 13.378904525015969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6392805129289627, "epoch": 0.00312, "grad_norm": 0.12371685355901718, "kl": 0.3664514124393463, "learning_rate": 7.999978666749097e-06, "loss": -0.0289, "step": 156, "step_time": 7.353855350986123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.03125, "completions/mean_terminated_length": 3.6451611518859863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7069642245769501, "epoch": 0.00314, "frac_reward_zero_std": 0.375, "grad_norm": 0.26631683111190796, "kl": 0.5304361656308174, "learning_rate": 7.99997830971387e-06, "loss": -0.0006, "num_tokens": 3262936.0, "reward": 0.7041438817977905, "reward_std": 0.5910683870315552, "rewards/rollout_reward_func/mean": 0.7041438817977905, "rewards/rollout_reward_func/std": 0.5910683870315552, "sampling/importance_sampling_ratio/max": 1.6349728107452393, "sampling/importance_sampling_ratio/mean": 0.9402143955230713, "sampling/importance_sampling_ratio/min": 0.00033975893165916204, "sampling/sampling_logp_difference/max": 2.447150230407715, "sampling/sampling_logp_difference/mean": 0.21192115545272827, "step": 157, "step_time": 13.181810795940692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7009778246283531, "epoch": 0.00316, "grad_norm": 0.27206453680992126, "kl": 0.546476311981678, "learning_rate": 7.999977949715709e-06, "loss": -0.0009, "step": 158, "step_time": 7.376753899996402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.125, "completions/mean_terminated_length": 3.7419352531433105, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7985552661120892, "epoch": 0.00318, "frac_reward_zero_std": 0.375, "grad_norm": 0.21483363211154938, "kl": 1.4836581200361252, "learning_rate": 7.999977586754615e-06, "loss": -0.0441, "num_tokens": 3303265.0, "reward": 0.9910327196121216, "reward_std": 0.5542011857032776, "rewards/rollout_reward_func/mean": 0.9910327196121216, "rewards/rollout_reward_func/std": 0.5542011857032776, "sampling/importance_sampling_ratio/max": 2.1620497703552246, "sampling/importance_sampling_ratio/mean": 0.9734662175178528, "sampling/importance_sampling_ratio/min": 2.845093604264548e-06, "sampling/sampling_logp_difference/max": 3.6989941596984863, "sampling/sampling_logp_difference/mean": 0.3075576722621918, "step": 159, "step_time": 13.04753549900488 }, { "clip_ratio/high_max": 0.030555556528270245, "clip_ratio/high_mean": 0.015277778264135122, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015277778264135122, "entropy": 0.8073850236833096, "epoch": 0.0032, "grad_norm": 0.1573040634393692, "kl": 1.4409708082675934, "learning_rate": 7.999977220830588e-06, "loss": -0.0443, "step": 160, "step_time": 8.021316843980458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7640041559934616, "epoch": 0.00322, "frac_reward_zero_std": 0.25, "grad_norm": 0.19169802963733673, "kl": 0.6079578250646591, "learning_rate": 7.999976851943628e-06, "loss": -0.0695, "num_tokens": 3345828.0, "reward": 0.6872769594192505, "reward_std": 0.5801010131835938, "rewards/rollout_reward_func/mean": 0.6872769594192505, "rewards/rollout_reward_func/std": 0.580100953578949, "sampling/importance_sampling_ratio/max": 2.5295310020446777, "sampling/importance_sampling_ratio/mean": 0.9438614249229431, "sampling/importance_sampling_ratio/min": 3.2580508957380516e-08, "sampling/sampling_logp_difference/max": 1.895420789718628, "sampling/sampling_logp_difference/mean": 0.39230573177337646, "step": 161, "step_time": 13.666974832012784 }, { "clip_ratio/high_max": 0.02729528583586216, "clip_ratio/high_mean": 0.01364764291793108, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01364764291793108, "entropy": 1.7740522921085358, "epoch": 0.00324, "grad_norm": 0.12487076222896576, "kl": 0.5756440833210945, "learning_rate": 7.999976480093737e-06, "loss": -0.0699, "step": 162, "step_time": 8.136318991018925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 3.71875, "completions/mean_terminated_length": 3.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6341586709022522, "epoch": 0.00326, "frac_reward_zero_std": 0.625, "grad_norm": 0.04305378720164299, "kl": 0.6359695568680763, "learning_rate": 7.999976105280914e-06, "loss": -0.0264, "num_tokens": 3388712.0, "reward": 0.8598195314407349, "reward_std": 0.30350786447525024, "rewards/rollout_reward_func/mean": 0.8598195314407349, "rewards/rollout_reward_func/std": 0.30350783467292786, "sampling/importance_sampling_ratio/max": 1.629688024520874, "sampling/importance_sampling_ratio/mean": 1.039099931716919, "sampling/importance_sampling_ratio/min": 0.0014685924397781491, "sampling/sampling_logp_difference/max": 2.2624824047088623, "sampling/sampling_logp_difference/mean": 0.19128936529159546, "step": 163, "step_time": 13.149317554023582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.640811949968338, "epoch": 0.00328, "grad_norm": 0.03973911330103874, "kl": 0.5903862044215202, "learning_rate": 7.99997572750516e-06, "loss": -0.0264, "step": 164, "step_time": 7.3730269389925525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.3125, "completions/mean_terminated_length": 4.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2656882256269455, "epoch": 0.0033, "frac_reward_zero_std": 0.375, "grad_norm": 0.08017738163471222, "kl": 0.8366366028785706, "learning_rate": 7.999975346766472e-06, "loss": -0.0264, "num_tokens": 3429706.0, "reward": 0.9470285177230835, "reward_std": 0.4469969868659973, "rewards/rollout_reward_func/mean": 0.9470285177230835, "rewards/rollout_reward_func/std": 0.4469969868659973, "sampling/importance_sampling_ratio/max": 2.6679983139038086, "sampling/importance_sampling_ratio/mean": 0.9792197942733765, "sampling/importance_sampling_ratio/min": 1.9819352019112557e-05, "sampling/sampling_logp_difference/max": 1.8672611713409424, "sampling/sampling_logp_difference/mean": 0.3145017921924591, "step": 165, "step_time": 13.174546527967323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2750265300273895, "epoch": 0.00332, "grad_norm": 0.07776640355587006, "kl": 0.8338227421045303, "learning_rate": 7.999974963064855e-06, "loss": -0.0262, "step": 166, "step_time": 7.37868692498887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 3.8620688915252686, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0692923069000244, "epoch": 0.00334, "frac_reward_zero_std": 0.375, "grad_norm": 0.2543671429157257, "kl": 0.9716434255242348, "learning_rate": 7.999974576400308e-06, "loss": -0.0323, "num_tokens": 3470414.0, "reward": 0.5100732445716858, "reward_std": 0.7571796178817749, "rewards/rollout_reward_func/mean": 0.5100732445716858, "rewards/rollout_reward_func/std": 0.7571795582771301, "sampling/importance_sampling_ratio/max": 1.5014803409576416, "sampling/importance_sampling_ratio/mean": 0.9422976970672607, "sampling/importance_sampling_ratio/min": 1.2011435046588304e-06, "sampling/sampling_logp_difference/max": 1.9712791442871094, "sampling/sampling_logp_difference/mean": 0.24268504977226257, "step": 167, "step_time": 13.372999849030748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0770593285560608, "epoch": 0.00336, "grad_norm": 0.26469337940216064, "kl": 0.8855393379926682, "learning_rate": 7.999974186772832e-06, "loss": -0.0336, "step": 168, "step_time": 7.335124856006587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.28125, "completions/mean_terminated_length": 4.1724138259887695, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.781124323606491, "epoch": 0.00338, "frac_reward_zero_std": 0.125, "grad_norm": 0.07939980924129486, "kl": 0.7253769710659981, "learning_rate": 7.999973794182426e-06, "loss": -0.0701, "num_tokens": 3511929.0, "reward": 0.8666316270828247, "reward_std": 0.6186143159866333, "rewards/rollout_reward_func/mean": 0.8666316270828247, "rewards/rollout_reward_func/std": 0.6186143755912781, "sampling/importance_sampling_ratio/max": 1.333914041519165, "sampling/importance_sampling_ratio/mean": 0.8858327865600586, "sampling/importance_sampling_ratio/min": 2.1133453742550046e-07, "sampling/sampling_logp_difference/max": 2.4222793579101562, "sampling/sampling_logp_difference/mean": 0.38785797357559204, "step": 169, "step_time": 13.43598685602774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7913196980953217, "epoch": 0.0034, "grad_norm": 0.08030654489994049, "kl": 0.6781406104564667, "learning_rate": 7.99997339862909e-06, "loss": -0.0702, "step": 170, "step_time": 8.091144042962696 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9116668850183487, "epoch": 0.00342, "frac_reward_zero_std": 0.125, "grad_norm": 0.2092408686876297, "kl": 0.487033873796463, "learning_rate": 7.999973000112826e-06, "loss": -0.0681, "num_tokens": 3554444.0, "reward": 0.575301468372345, "reward_std": 0.6878501176834106, "rewards/rollout_reward_func/mean": 0.575301468372345, "rewards/rollout_reward_func/std": 0.6878500580787659, "sampling/importance_sampling_ratio/max": 1.687791347503662, "sampling/importance_sampling_ratio/mean": 0.8419430255889893, "sampling/importance_sampling_ratio/min": 1.3466308246279368e-07, "sampling/sampling_logp_difference/max": 3.012042760848999, "sampling/sampling_logp_difference/mean": 0.38707733154296875, "step": 171, "step_time": 13.424593357020058 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.005434782709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012379227206110954, "entropy": 1.9155390709638596, "epoch": 0.00344, "grad_norm": 0.12180078029632568, "kl": 0.4777339994907379, "learning_rate": 7.999972598633632e-06, "loss": -0.0685, "step": 172, "step_time": 8.076039884996135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 3.8965516090393066, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4444396048784256, "epoch": 0.00346, "frac_reward_zero_std": 0.375, "grad_norm": 0.08408041298389435, "kl": 0.5472258031368256, "learning_rate": 7.999972194191514e-06, "loss": -0.0481, "num_tokens": 3593723.0, "reward": 0.8370929956436157, "reward_std": 0.7010805606842041, "rewards/rollout_reward_func/mean": 0.8370929956436157, "rewards/rollout_reward_func/std": 0.7010805606842041, "sampling/importance_sampling_ratio/max": 1.5014570951461792, "sampling/importance_sampling_ratio/mean": 0.8865096569061279, "sampling/importance_sampling_ratio/min": 7.709008431611153e-10, "sampling/sampling_logp_difference/max": 2.3534250259399414, "sampling/sampling_logp_difference/mean": 0.38450753688812256, "step": 173, "step_time": 13.385580102010863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4496548771858215, "epoch": 0.00348, "grad_norm": 0.08553479611873627, "kl": 0.5279839932918549, "learning_rate": 7.999971786786465e-06, "loss": -0.0482, "step": 174, "step_time": 7.346707860007882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.709677219390869, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4871947094798088, "epoch": 0.0035, "frac_reward_zero_std": 0.375, "grad_norm": 0.06495398283004761, "kl": 0.5374166555702686, "learning_rate": 7.99997137641849e-06, "loss": -0.0571, "num_tokens": 3633188.0, "reward": 0.9011430740356445, "reward_std": 0.6213991641998291, "rewards/rollout_reward_func/mean": 0.9011430740356445, "rewards/rollout_reward_func/std": 0.6213991641998291, "sampling/importance_sampling_ratio/max": 1.6707725524902344, "sampling/importance_sampling_ratio/mean": 0.9220820665359497, "sampling/importance_sampling_ratio/min": 1.0416396634127523e-08, "sampling/sampling_logp_difference/max": 2.616302728652954, "sampling/sampling_logp_difference/mean": 0.3611287474632263, "step": 175, "step_time": 13.272737861028872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4953911304473877, "epoch": 0.00352, "grad_norm": 0.06583996117115021, "kl": 0.5452149584889412, "learning_rate": 7.999970963087587e-06, "loss": -0.0571, "step": 176, "step_time": 7.364386145985918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.137930870056152, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5511359423398972, "epoch": 0.00354, "frac_reward_zero_std": 0.125, "grad_norm": 0.18906579911708832, "kl": 0.48357143253088, "learning_rate": 7.99997054679376e-06, "loss": -0.0447, "num_tokens": 3671968.0, "reward": 0.8083615303039551, "reward_std": 0.7523511052131653, "rewards/rollout_reward_func/mean": 0.8083615303039551, "rewards/rollout_reward_func/std": 0.7523511052131653, "sampling/importance_sampling_ratio/max": 1.4444966316223145, "sampling/importance_sampling_ratio/mean": 0.8668118715286255, "sampling/importance_sampling_ratio/min": 1.3999560621869023e-08, "sampling/sampling_logp_difference/max": 1.8861876726150513, "sampling/sampling_logp_difference/mean": 0.35161739587783813, "step": 177, "step_time": 13.31176350399619 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.5426936894655228, "epoch": 0.00356, "grad_norm": 0.1412348598241806, "kl": 0.48015454411506653, "learning_rate": 7.999970127537005e-06, "loss": -0.0447, "step": 178, "step_time": 7.367456412030151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.387096405029297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.351994514465332, "epoch": 0.00358, "frac_reward_zero_std": 0.25, "grad_norm": 0.09150045365095139, "kl": 0.7278873771429062, "learning_rate": 7.999969705317325e-06, "loss": -0.0497, "num_tokens": 3714315.0, "reward": 0.646558403968811, "reward_std": 0.6613664031028748, "rewards/rollout_reward_func/mean": 0.646558403968811, "rewards/rollout_reward_func/std": 0.6613664031028748, "sampling/importance_sampling_ratio/max": 1.5187245607376099, "sampling/importance_sampling_ratio/mean": 0.8719314336776733, "sampling/importance_sampling_ratio/min": 7.675842539356381e-08, "sampling/sampling_logp_difference/max": 2.132737636566162, "sampling/sampling_logp_difference/mean": 0.30357179045677185, "step": 179, "step_time": 13.44879229401704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3491946160793304, "epoch": 0.0036, "grad_norm": 0.09354283660650253, "kl": 0.7311538457870483, "learning_rate": 7.99996928013472e-06, "loss": -0.0496, "step": 180, "step_time": 8.352474010986043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5614704489707947, "epoch": 0.00362, "frac_reward_zero_std": 0.25, "grad_norm": 0.08784753084182739, "kl": 0.7032561898231506, "learning_rate": 7.999968851989192e-06, "loss": -0.0455, "num_tokens": 3754895.0, "reward": 0.6712503433227539, "reward_std": 0.8031108975410461, "rewards/rollout_reward_func/mean": 0.6712503433227539, "rewards/rollout_reward_func/std": 0.8031108975410461, "sampling/importance_sampling_ratio/max": 1.3819681406021118, "sampling/importance_sampling_ratio/mean": 0.709526777267456, "sampling/importance_sampling_ratio/min": 2.8278171471107783e-11, "sampling/sampling_logp_difference/max": 2.9196736812591553, "sampling/sampling_logp_difference/mean": 0.5476837754249573, "step": 181, "step_time": 13.568661386991153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.559103488922119, "epoch": 0.00364, "grad_norm": 0.09220090508460999, "kl": 0.7214163392782211, "learning_rate": 7.999968420880736e-06, "loss": -0.0459, "step": 182, "step_time": 7.955302731017582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 4.096774101257324, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1890787407755852, "epoch": 0.00366, "frac_reward_zero_std": 0.25, "grad_norm": 0.11672062426805496, "kl": 0.8290963619947433, "learning_rate": 7.99996798680936e-06, "loss": -0.0528, "num_tokens": 3794301.0, "reward": 0.7540483474731445, "reward_std": 0.8240371346473694, "rewards/rollout_reward_func/mean": 0.7540483474731445, "rewards/rollout_reward_func/std": 0.8240371346473694, "sampling/importance_sampling_ratio/max": 1.44930899143219, "sampling/importance_sampling_ratio/mean": 0.9053283929824829, "sampling/importance_sampling_ratio/min": 2.7491733334272794e-09, "sampling/sampling_logp_difference/max": 2.2532095909118652, "sampling/sampling_logp_difference/mean": 0.33568960428237915, "step": 183, "step_time": 13.119284423970385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1855115741491318, "epoch": 0.00368, "grad_norm": 0.11994508653879166, "kl": 0.8238307684659958, "learning_rate": 7.999967549775057e-06, "loss": -0.0532, "step": 184, "step_time": 7.3269670179870445 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 3.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5991080030798912, "epoch": 0.0037, "frac_reward_zero_std": 0.375, "grad_norm": 0.3029656708240509, "kl": 0.7305047661066055, "learning_rate": 7.999967109777834e-06, "loss": -0.0498, "num_tokens": 3831233.0, "reward": 0.9180835485458374, "reward_std": 0.8857027292251587, "rewards/rollout_reward_func/mean": 0.9180835485458374, "rewards/rollout_reward_func/std": 0.8857027292251587, "sampling/importance_sampling_ratio/max": 1.5391758680343628, "sampling/importance_sampling_ratio/mean": 0.942392110824585, "sampling/importance_sampling_ratio/min": 9.27930088323592e-09, "sampling/sampling_logp_difference/max": 2.275027275085449, "sampling/sampling_logp_difference/mean": 0.33675286173820496, "step": 185, "step_time": 13.148553191975225 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01666666753590107, "entropy": 1.592967838048935, "epoch": 0.00372, "grad_norm": 0.0522928386926651, "kl": 0.7153053730726242, "learning_rate": 7.999966666817687e-06, "loss": -0.0507, "step": 186, "step_time": 7.2884532490279526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.1875, "completions/mean_terminated_length": 3.8064515590667725, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9091096296906471, "epoch": 0.00374, "frac_reward_zero_std": 0.375, "grad_norm": 0.22652484476566315, "kl": 0.36620641499757767, "learning_rate": 7.999966220894617e-06, "loss": -0.0191, "num_tokens": 3873499.0, "reward": 0.5607228875160217, "reward_std": 0.6929969787597656, "rewards/rollout_reward_func/mean": 0.5607228875160217, "rewards/rollout_reward_func/std": 0.6929969787597656, "sampling/importance_sampling_ratio/max": 1.6803570985794067, "sampling/importance_sampling_ratio/mean": 1.0094106197357178, "sampling/importance_sampling_ratio/min": 1.5750340480735758e-06, "sampling/sampling_logp_difference/max": 1.6263715028762817, "sampling/sampling_logp_difference/mean": 0.2256474494934082, "step": 187, "step_time": 13.200030180974863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9128739237785339, "epoch": 0.00376, "grad_norm": 0.23419103026390076, "kl": 0.3669833689928055, "learning_rate": 7.999965772008627e-06, "loss": -0.0197, "step": 188, "step_time": 7.3486230459820945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5046539455652237, "epoch": 0.00378, "frac_reward_zero_std": 0.125, "grad_norm": 0.18279901146888733, "kl": 0.3912477046251297, "learning_rate": 7.999965320159715e-06, "loss": -0.06, "num_tokens": 3915907.0, "reward": 0.7646108865737915, "reward_std": 0.6228224039077759, "rewards/rollout_reward_func/mean": 0.7646108865737915, "rewards/rollout_reward_func/std": 0.6228224039077759, "sampling/importance_sampling_ratio/max": 1.511399507522583, "sampling/importance_sampling_ratio/mean": 0.8522161245346069, "sampling/importance_sampling_ratio/min": 1.1897361673618434e-06, "sampling/sampling_logp_difference/max": 2.049565553665161, "sampling/sampling_logp_difference/mean": 0.36509114503860474, "step": 189, "step_time": 13.393189770984463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5005701035261154, "epoch": 0.0038, "grad_norm": 0.1776597797870636, "kl": 0.4138626977801323, "learning_rate": 7.999964865347883e-06, "loss": -0.0601, "step": 190, "step_time": 7.803228005999699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.03125, "completions/mean_terminated_length": 3.8965516090393066, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3941352367401123, "epoch": 0.00382, "frac_reward_zero_std": 0.25, "grad_norm": 0.22454583644866943, "kl": 0.49800530076026917, "learning_rate": 7.999964407573131e-06, "loss": -0.048, "num_tokens": 3957087.0, "reward": 0.5861400365829468, "reward_std": 0.6593859791755676, "rewards/rollout_reward_func/mean": 0.5861400365829468, "rewards/rollout_reward_func/std": 0.6593859791755676, "sampling/importance_sampling_ratio/max": 1.4407349824905396, "sampling/importance_sampling_ratio/mean": 0.8975809812545776, "sampling/importance_sampling_ratio/min": 6.151083198346896e-06, "sampling/sampling_logp_difference/max": 2.8267312049865723, "sampling/sampling_logp_difference/mean": 0.3081477880477905, "step": 191, "step_time": 13.842904330027523 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01785714365541935, "entropy": 1.3780954331159592, "epoch": 0.00384, "grad_norm": 0.12557366490364075, "kl": 0.5030355677008629, "learning_rate": 7.999963946835458e-06, "loss": -0.0487, "step": 192, "step_time": 7.431009833002463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.008666217327118, "epoch": 0.00386, "frac_reward_zero_std": 0.125, "grad_norm": 0.47509515285491943, "kl": 0.19056814908981323, "learning_rate": 7.999963483134866e-06, "loss": -0.068, "num_tokens": 4004710.0, "reward": 0.5023146867752075, "reward_std": 0.8096634149551392, "rewards/rollout_reward_func/mean": 0.5023146867752075, "rewards/rollout_reward_func/std": 0.8096634149551392, "sampling/importance_sampling_ratio/max": 2.8013057708740234, "sampling/importance_sampling_ratio/mean": 0.895485520362854, "sampling/importance_sampling_ratio/min": 1.5457061408596928e-06, "sampling/sampling_logp_difference/max": 2.1325716972351074, "sampling/sampling_logp_difference/mean": 0.34611380100250244, "step": 193, "step_time": 19.859709551994456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.9884281903505325, "epoch": 0.00388, "grad_norm": 0.23440833389759064, "kl": 0.20012407004833221, "learning_rate": 7.999963016471355e-06, "loss": -0.0717, "step": 194, "step_time": 10.47234369898797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9294780790805817, "epoch": 0.0039, "frac_reward_zero_std": 0.125, "grad_norm": 0.14765135943889618, "kl": 0.9063562154769897, "learning_rate": 7.999962546844924e-06, "loss": -0.0462, "num_tokens": 4055200.0, "reward": 0.5871508121490479, "reward_std": 0.773273766040802, "rewards/rollout_reward_func/mean": 0.5871508121490479, "rewards/rollout_reward_func/std": 0.773273766040802, "sampling/importance_sampling_ratio/max": 2.489835023880005, "sampling/importance_sampling_ratio/mean": 0.7764290571212769, "sampling/importance_sampling_ratio/min": 1.2649767455741312e-08, "sampling/sampling_logp_difference/max": 2.5036892890930176, "sampling/sampling_logp_difference/mean": 0.3896065056324005, "step": 195, "step_time": 18.91470408299938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.9092100262641907, "epoch": 0.00392, "grad_norm": 0.14290809631347656, "kl": 1.0714601427316666, "learning_rate": 7.999962074255578e-06, "loss": -0.0467, "step": 196, "step_time": 10.30337335797958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 5.079999923706055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.312853455543518, "epoch": 0.00394, "frac_reward_zero_std": 0.125, "grad_norm": 0.14406464993953705, "kl": 0.2198803387582302, "learning_rate": 7.999961598703312e-06, "loss": -0.0812, "num_tokens": 4104508.0, "reward": 0.5962635278701782, "reward_std": 0.8179559111595154, "rewards/rollout_reward_func/mean": 0.5962635278701782, "rewards/rollout_reward_func/std": 0.8179559111595154, "sampling/importance_sampling_ratio/max": 1.8901411294937134, "sampling/importance_sampling_ratio/mean": 0.7391132116317749, "sampling/importance_sampling_ratio/min": 5.525117785509792e-08, "sampling/sampling_logp_difference/max": 2.2401914596557617, "sampling/sampling_logp_difference/mean": 0.4254765510559082, "step": 197, "step_time": 19.291614877962274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018464575987309217, "clip_ratio/low_min": 0.0062500000931322575, "clip_ratio/region_mean": 0.018464575987309217, "entropy": 2.3030026257038116, "epoch": 0.00396, "grad_norm": 0.10856854170560837, "kl": 0.25889239087700844, "learning_rate": 7.99996112018813e-06, "loss": -0.0822, "step": 198, "step_time": 10.237059728009626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6442953199148178, "epoch": 0.00398, "frac_reward_zero_std": 0.125, "grad_norm": 0.19805808365345, "kl": 0.5091748535633087, "learning_rate": 7.999960638710032e-06, "loss": -0.0419, "num_tokens": 4158389.0, "reward": 0.6696329116821289, "reward_std": 0.7860524654388428, "rewards/rollout_reward_func/mean": 0.6696329116821289, "rewards/rollout_reward_func/std": 0.7860524654388428, "sampling/importance_sampling_ratio/max": 2.6637518405914307, "sampling/importance_sampling_ratio/mean": 0.8626466989517212, "sampling/importance_sampling_ratio/min": 5.495537607203005e-06, "sampling/sampling_logp_difference/max": 3.5910470485687256, "sampling/sampling_logp_difference/mean": 0.33805492520332336, "step": 199, "step_time": 20.198439399013296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007305195089429617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007305195089429617, "entropy": 1.6066623032093048, "epoch": 0.004, "grad_norm": 0.16025741398334503, "kl": 0.5134688392281532, "learning_rate": 7.999960154269017e-06, "loss": -0.0429, "step": 200, "step_time": 10.36067363602342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 5.4166669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3633706867694855, "epoch": 0.00402, "frac_reward_zero_std": 0.125, "grad_norm": 0.14198699593544006, "kl": 1.4121363162994385, "learning_rate": 7.999959666865086e-06, "loss": -0.0591, "num_tokens": 4214697.0, "reward": 0.22793127596378326, "reward_std": 0.5076944231987, "rewards/rollout_reward_func/mean": 0.22793127596378326, "rewards/rollout_reward_func/std": 0.5076944231987, "sampling/importance_sampling_ratio/max": 2.3811168670654297, "sampling/importance_sampling_ratio/mean": 0.46382516622543335, "sampling/importance_sampling_ratio/min": 1.066164490026722e-08, "sampling/sampling_logp_difference/max": 3.5236082077026367, "sampling/sampling_logp_difference/mean": 0.589319109916687, "step": 201, "step_time": 20.447796414024197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013257576385512948, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013257576385512948, "entropy": 2.3499532341957092, "epoch": 0.00404, "grad_norm": 0.17481614649295807, "kl": 1.6949417293071747, "learning_rate": 7.99995917649824e-06, "loss": -0.0583, "step": 202, "step_time": 10.502768271020614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2325398474931717, "epoch": 0.00406, "frac_reward_zero_std": 0.375, "grad_norm": 0.08437079936265945, "kl": 0.27968743816018105, "learning_rate": 7.999958683168479e-06, "loss": -0.0047, "num_tokens": 4264069.0, "reward": 0.786902904510498, "reward_std": 0.7106658220291138, "rewards/rollout_reward_func/mean": 0.786902904510498, "rewards/rollout_reward_func/std": 0.7106658220291138, "sampling/importance_sampling_ratio/max": 2.3834831714630127, "sampling/importance_sampling_ratio/mean": 0.9747942686080933, "sampling/importance_sampling_ratio/min": 3.470091485269222e-07, "sampling/sampling_logp_difference/max": 2.255671977996826, "sampling/sampling_logp_difference/mean": 0.22411856055259705, "step": 203, "step_time": 19.536794965009904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2276110351085663, "epoch": 0.00408, "grad_norm": 0.08438724279403687, "kl": 0.2830059081315994, "learning_rate": 7.999958186875805e-06, "loss": -0.0048, "step": 204, "step_time": 10.130022457975429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.250650316476822, "epoch": 0.0041, "frac_reward_zero_std": 0.125, "grad_norm": 0.14332474768161774, "kl": 0.475493960082531, "learning_rate": 7.999957687620215e-06, "loss": -0.07, "num_tokens": 4317225.0, "reward": 0.895460307598114, "reward_std": 0.6736186146736145, "rewards/rollout_reward_func/mean": 0.895460307598114, "rewards/rollout_reward_func/std": 0.6736186742782593, "sampling/importance_sampling_ratio/max": 1.901850938796997, "sampling/importance_sampling_ratio/mean": 1.0003588199615479, "sampling/importance_sampling_ratio/min": 1.7091957715820172e-06, "sampling/sampling_logp_difference/max": 3.3753082752227783, "sampling/sampling_logp_difference/mean": 0.3183167576789856, "step": 205, "step_time": 20.493346692994237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.24564977735281, "epoch": 0.00412, "grad_norm": 0.14257045090198517, "kl": 0.4886472374200821, "learning_rate": 7.999957185401714e-06, "loss": -0.0706, "step": 206, "step_time": 10.77078788203653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.53125, "completions/mean_terminated_length": 4.161290168762207, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7850259318947792, "epoch": 0.00414, "frac_reward_zero_std": 0.125, "grad_norm": 0.12926267087459564, "kl": 1.1749890968203545, "learning_rate": 7.9999566802203e-06, "loss": -0.0554, "num_tokens": 4367893.0, "reward": 0.6718871593475342, "reward_std": 0.7603933811187744, "rewards/rollout_reward_func/mean": 0.6718871593475342, "rewards/rollout_reward_func/std": 0.7603934407234192, "sampling/importance_sampling_ratio/max": 1.7392818927764893, "sampling/importance_sampling_ratio/mean": 1.0531036853790283, "sampling/importance_sampling_ratio/min": 0.0002180065494030714, "sampling/sampling_logp_difference/max": 2.310840129852295, "sampling/sampling_logp_difference/mean": 0.21498985588550568, "step": 207, "step_time": 18.730757941986667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7718899622559547, "epoch": 0.00416, "grad_norm": 0.12880878150463104, "kl": 1.1780114695429802, "learning_rate": 7.999956172075974e-06, "loss": -0.0558, "step": 208, "step_time": 10.238878122996539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3017520904541016, "epoch": 0.00418, "frac_reward_zero_std": 0.125, "grad_norm": 0.14446108043193817, "kl": 0.6079902425408363, "learning_rate": 7.999955660968735e-06, "loss": -0.0792, "num_tokens": 4420033.0, "reward": 0.6342819333076477, "reward_std": 0.6713907122612, "rewards/rollout_reward_func/mean": 0.6342819333076477, "rewards/rollout_reward_func/std": 0.6713906526565552, "sampling/importance_sampling_ratio/max": 1.7755168676376343, "sampling/importance_sampling_ratio/mean": 0.9550551176071167, "sampling/importance_sampling_ratio/min": 0.00013590506569016725, "sampling/sampling_logp_difference/max": 2.691565752029419, "sampling/sampling_logp_difference/mean": 0.30557265877723694, "step": 209, "step_time": 19.463991702970816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2901688069105148, "epoch": 0.0042, "grad_norm": 0.14759834110736847, "kl": 0.6393308490514755, "learning_rate": 7.999955146898586e-06, "loss": -0.0794, "step": 210, "step_time": 10.816957039001863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.275862216949463, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.134396344423294, "epoch": 0.00422, "frac_reward_zero_std": 0.0, "grad_norm": 0.17698553204536438, "kl": 1.0594890415668488, "learning_rate": 7.999954629865525e-06, "loss": -0.056, "num_tokens": 4475615.0, "reward": 0.23131786286830902, "reward_std": 0.6855910420417786, "rewards/rollout_reward_func/mean": 0.23131786286830902, "rewards/rollout_reward_func/std": 0.6855909824371338, "sampling/importance_sampling_ratio/max": 2.3930375576019287, "sampling/importance_sampling_ratio/mean": 0.8904502391815186, "sampling/importance_sampling_ratio/min": 2.704018152144272e-05, "sampling/sampling_logp_difference/max": 2.60434627532959, "sampling/sampling_logp_difference/mean": 0.2681208848953247, "step": 211, "step_time": 19.449141403019894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1282896995544434, "epoch": 0.00424, "grad_norm": 0.18275600671768188, "kl": 1.0311454981565475, "learning_rate": 7.999954109869554e-06, "loss": -0.0567, "step": 212, "step_time": 10.456656957016094 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1959877610206604, "epoch": 0.00426, "frac_reward_zero_std": 0.125, "grad_norm": 0.1488141119480133, "kl": 1.3233310878276825, "learning_rate": 7.999953586910674e-06, "loss": -0.0095, "num_tokens": 4523905.0, "reward": 0.7539244890213013, "reward_std": 0.7512176036834717, "rewards/rollout_reward_func/mean": 0.7539244890213013, "rewards/rollout_reward_func/std": 0.7512176632881165, "sampling/importance_sampling_ratio/max": 1.506996989250183, "sampling/importance_sampling_ratio/mean": 0.791650116443634, "sampling/importance_sampling_ratio/min": 6.462191322498256e-06, "sampling/sampling_logp_difference/max": 1.9949216842651367, "sampling/sampling_logp_difference/mean": 0.3205275535583496, "step": 213, "step_time": 19.084921025001677 }, { "clip_ratio/high_max": 0.03345704032108188, "clip_ratio/high_mean": 0.01672852016054094, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01672852016054094, "entropy": 1.2111055254936218, "epoch": 0.00428, "grad_norm": 0.13576924800872803, "kl": 1.2142965644598007, "learning_rate": 7.999953060988884e-06, "loss": -0.0102, "step": 214, "step_time": 10.393377058964688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 5.22580623626709, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4802465438842773, "epoch": 0.0043, "frac_reward_zero_std": 0.125, "grad_norm": 0.08675063401460648, "kl": 0.9460023045539856, "learning_rate": 7.999952532104185e-06, "loss": -0.0628, "num_tokens": 4579878.0, "reward": 0.7587811946868896, "reward_std": 0.6677835583686829, "rewards/rollout_reward_func/mean": 0.7587811946868896, "rewards/rollout_reward_func/std": 0.6677834987640381, "sampling/importance_sampling_ratio/max": 2.931607484817505, "sampling/importance_sampling_ratio/mean": 0.9424468278884888, "sampling/importance_sampling_ratio/min": 1.4987378449404787e-07, "sampling/sampling_logp_difference/max": 2.060391426086426, "sampling/sampling_logp_difference/mean": 0.356830894947052, "step": 215, "step_time": 19.387657660990953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4903254210948944, "epoch": 0.00432, "grad_norm": 0.09715536236763, "kl": 0.8149576932191849, "learning_rate": 7.99995200025658e-06, "loss": -0.0627, "step": 216, "step_time": 10.335150866972981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.464285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5065660774707794, "epoch": 0.00434, "frac_reward_zero_std": 0.125, "grad_norm": 0.13881060481071472, "kl": 0.7503993958234787, "learning_rate": 7.999951465446065e-06, "loss": -0.0819, "num_tokens": 4634977.0, "reward": 0.716366708278656, "reward_std": 0.7104789018630981, "rewards/rollout_reward_func/mean": 0.716366708278656, "rewards/rollout_reward_func/std": 0.7104788422584534, "sampling/importance_sampling_ratio/max": 1.6915868520736694, "sampling/importance_sampling_ratio/mean": 0.9086317420005798, "sampling/importance_sampling_ratio/min": 9.14068664314982e-07, "sampling/sampling_logp_difference/max": 2.3628976345062256, "sampling/sampling_logp_difference/mean": 0.3833477199077606, "step": 217, "step_time": 19.9022657769965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5114492177963257, "epoch": 0.00436, "grad_norm": 0.13927283883094788, "kl": 0.691320925951004, "learning_rate": 7.999950927672645e-06, "loss": -0.082, "step": 218, "step_time": 11.004902192013105 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 6.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2945502400398254, "epoch": 0.00438, "frac_reward_zero_std": 0.125, "grad_norm": 0.1386832296848297, "kl": 0.6575180143117905, "learning_rate": 7.999950386936317e-06, "loss": -0.0741, "num_tokens": 4690702.0, "reward": 0.5154240131378174, "reward_std": 0.6907124519348145, "rewards/rollout_reward_func/mean": 0.5154240131378174, "rewards/rollout_reward_func/std": 0.6907124519348145, "sampling/importance_sampling_ratio/max": 1.8079307079315186, "sampling/importance_sampling_ratio/mean": 0.6835378408432007, "sampling/importance_sampling_ratio/min": 3.793768854620794e-09, "sampling/sampling_logp_difference/max": 3.4504597187042236, "sampling/sampling_logp_difference/mean": 0.47897663712501526, "step": 219, "step_time": 20.48673896701075 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "entropy": 2.2889934182167053, "epoch": 0.0044, "grad_norm": 0.13355796039104462, "kl": 0.6278674900531769, "learning_rate": 7.999949843237083e-06, "loss": -0.0746, "step": 220, "step_time": 11.156342638016213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7134639620780945, "epoch": 0.00442, "frac_reward_zero_std": 0.125, "grad_norm": 0.08029253780841827, "kl": 0.5280777886509895, "learning_rate": 7.999949296574944e-06, "loss": -0.0937, "num_tokens": 4740648.0, "reward": 0.611752986907959, "reward_std": 0.8220751881599426, "rewards/rollout_reward_func/mean": 0.611752986907959, "rewards/rollout_reward_func/std": 0.8220752477645874, "sampling/importance_sampling_ratio/max": 2.120950222015381, "sampling/importance_sampling_ratio/mean": 0.8367366790771484, "sampling/importance_sampling_ratio/min": 5.1687494284635704e-09, "sampling/sampling_logp_difference/max": 2.361069440841675, "sampling/sampling_logp_difference/mean": 0.3507883548736572, "step": 221, "step_time": 19.198293335008202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7113716006278992, "epoch": 0.00444, "grad_norm": 0.080698162317276, "kl": 0.5316110178828239, "learning_rate": 7.9999487469499e-06, "loss": -0.0938, "step": 222, "step_time": 10.289292323985137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 5.375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5978971123695374, "epoch": 0.00446, "frac_reward_zero_std": 0.125, "grad_norm": 0.058583978563547134, "kl": 0.3151433765888214, "learning_rate": 7.999948194361951e-06, "loss": -0.0801, "num_tokens": 4792303.0, "reward": 0.4277728796005249, "reward_std": 0.8462741374969482, "rewards/rollout_reward_func/mean": 0.4277728796005249, "rewards/rollout_reward_func/std": 0.8462740778923035, "sampling/importance_sampling_ratio/max": 1.7857283353805542, "sampling/importance_sampling_ratio/mean": 0.7648119926452637, "sampling/importance_sampling_ratio/min": 3.610931642583637e-08, "sampling/sampling_logp_difference/max": 2.168161630630493, "sampling/sampling_logp_difference/mean": 0.435150146484375, "step": 223, "step_time": 19.81195802000002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5882712602615356, "epoch": 0.00448, "grad_norm": 0.060595013201236725, "kl": 0.31476064026355743, "learning_rate": 7.999947638811098e-06, "loss": -0.0804, "step": 224, "step_time": 10.341546151030343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.517274260520935, "epoch": 0.0045, "frac_reward_zero_std": 0.25, "grad_norm": 0.141582652926445, "kl": 0.5963888466358185, "learning_rate": 7.999947080297344e-06, "loss": -0.0403, "num_tokens": 4836462.0, "reward": 0.23226723074913025, "reward_std": 0.7982401251792908, "rewards/rollout_reward_func/mean": 0.23226723074913025, "rewards/rollout_reward_func/std": 0.798240065574646, "sampling/importance_sampling_ratio/max": 1.7266896963119507, "sampling/importance_sampling_ratio/mean": 0.8862825632095337, "sampling/importance_sampling_ratio/min": 2.386546157140401e-06, "sampling/sampling_logp_difference/max": 1.9903875589370728, "sampling/sampling_logp_difference/mean": 0.29696816205978394, "step": 225, "step_time": 18.781414410012076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.5127398073673248, "epoch": 0.00452, "grad_norm": 0.14280728995800018, "kl": 0.6050507910549641, "learning_rate": 7.999946518820686e-06, "loss": -0.0405, "step": 226, "step_time": 10.129890460986644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.857142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7129860892891884, "epoch": 0.00454, "frac_reward_zero_std": 0.125, "grad_norm": 0.08004257827997208, "kl": 0.9292117580771446, "learning_rate": 7.999945954381125e-06, "loss": -0.0702, "num_tokens": 4884267.0, "reward": 0.7805823683738708, "reward_std": 0.6008060574531555, "rewards/rollout_reward_func/mean": 0.7805823683738708, "rewards/rollout_reward_func/std": 0.6008059978485107, "sampling/importance_sampling_ratio/max": 1.9419777393341064, "sampling/importance_sampling_ratio/mean": 0.8694511651992798, "sampling/importance_sampling_ratio/min": 1.7142440356110455e-07, "sampling/sampling_logp_difference/max": 2.9937596321105957, "sampling/sampling_logp_difference/mean": 0.4184224605560303, "step": 227, "step_time": 19.88183453702368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7116919346153736, "epoch": 0.00456, "grad_norm": 0.08048371225595474, "kl": 0.8942985981702805, "learning_rate": 7.999945386978663e-06, "loss": -0.0706, "step": 228, "step_time": 11.423654997022822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.296296119689941, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5857596695423126, "epoch": 0.00458, "frac_reward_zero_std": 0.0, "grad_norm": 0.14748893678188324, "kl": 0.49177994206547737, "learning_rate": 7.999944816613299e-06, "loss": -0.0918, "num_tokens": 4942273.0, "reward": 0.4331611096858978, "reward_std": 0.7354938983917236, "rewards/rollout_reward_func/mean": 0.4331611096858978, "rewards/rollout_reward_func/std": 0.7354938983917236, "sampling/importance_sampling_ratio/max": 1.9407720565795898, "sampling/importance_sampling_ratio/mean": 0.892997682094574, "sampling/importance_sampling_ratio/min": 5.697438609786332e-06, "sampling/sampling_logp_difference/max": 2.3139376640319824, "sampling/sampling_logp_difference/mean": 0.33017268776893616, "step": 229, "step_time": 20.472552293998888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.57668137550354, "epoch": 0.0046, "grad_norm": 0.1454998254776001, "kl": 0.5063322372734547, "learning_rate": 7.999944243285035e-06, "loss": -0.0922, "step": 230, "step_time": 11.205414848955115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3235075436532497, "epoch": 0.00462, "frac_reward_zero_std": 0.125, "grad_norm": 0.03756793588399887, "kl": 0.3812147192656994, "learning_rate": 7.999943666993872e-06, "loss": -0.0664, "num_tokens": 4990269.0, "reward": 0.3905232548713684, "reward_std": 0.8673736453056335, "rewards/rollout_reward_func/mean": 0.3905232548713684, "rewards/rollout_reward_func/std": 0.8673736453056335, "sampling/importance_sampling_ratio/max": 1.6667112112045288, "sampling/importance_sampling_ratio/mean": 0.9201664328575134, "sampling/importance_sampling_ratio/min": 2.7236710593570024e-06, "sampling/sampling_logp_difference/max": 2.823099374771118, "sampling/sampling_logp_difference/mean": 0.2637081444263458, "step": 231, "step_time": 19.98226652003359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3154640011489391, "epoch": 0.00464, "grad_norm": 0.035604268312454224, "kl": 0.3763782009482384, "learning_rate": 7.999943087739808e-06, "loss": -0.0666, "step": 232, "step_time": 10.633489158994053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.118122100830078, "epoch": 0.00466, "frac_reward_zero_std": 0.125, "grad_norm": 0.10852320492267609, "kl": 0.5086382403969765, "learning_rate": 7.999942505522845e-06, "loss": -0.0416, "num_tokens": 5040965.0, "reward": 0.5077102780342102, "reward_std": 0.7476881742477417, "rewards/rollout_reward_func/mean": 0.5077102780342102, "rewards/rollout_reward_func/std": 0.7476881742477417, "sampling/importance_sampling_ratio/max": 1.5386544466018677, "sampling/importance_sampling_ratio/mean": 0.71016526222229, "sampling/importance_sampling_ratio/min": 3.389108655937889e-07, "sampling/sampling_logp_difference/max": 2.4978299140930176, "sampling/sampling_logp_difference/mean": 0.40769892930984497, "step": 233, "step_time": 19.991223464952782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.113588571548462, "epoch": 0.00468, "grad_norm": 0.11033590883016586, "kl": 0.5026055723428726, "learning_rate": 7.999941920342986e-06, "loss": -0.0417, "step": 234, "step_time": 10.727274738979759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 4.759999752044678, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.305933892726898, "epoch": 0.0047, "frac_reward_zero_std": 0.125, "grad_norm": 0.1703493893146515, "kl": 0.5226923823356628, "learning_rate": 7.999941332200228e-06, "loss": -0.0781, "num_tokens": 5095209.0, "reward": 0.5797775387763977, "reward_std": 0.7259615659713745, "rewards/rollout_reward_func/mean": 0.5797775387763977, "rewards/rollout_reward_func/std": 0.7259615659713745, "sampling/importance_sampling_ratio/max": 1.6668305397033691, "sampling/importance_sampling_ratio/mean": 0.6780193448066711, "sampling/importance_sampling_ratio/min": 4.998346980755741e-07, "sampling/sampling_logp_difference/max": 2.6483676433563232, "sampling/sampling_logp_difference/mean": 0.47222375869750977, "step": 235, "step_time": 19.948120530025335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3020027577877045, "epoch": 0.00472, "grad_norm": 0.1450866311788559, "kl": 0.5262585133314133, "learning_rate": 7.999940741094573e-06, "loss": -0.0788, "step": 236, "step_time": 10.412260244047502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.407407283782959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9889835119247437, "epoch": 0.00474, "frac_reward_zero_std": 0.0, "grad_norm": 1.078805685043335, "kl": 2.281186580657959, "learning_rate": 7.999940147026021e-06, "loss": -0.0645, "num_tokens": 5146212.0, "reward": 0.7990989685058594, "reward_std": 0.8374018669128418, "rewards/rollout_reward_func/mean": 0.7990989685058594, "rewards/rollout_reward_func/std": 0.8374017477035522, "sampling/importance_sampling_ratio/max": 1.5026054382324219, "sampling/importance_sampling_ratio/mean": 0.8050764799118042, "sampling/importance_sampling_ratio/min": 9.405759504588218e-10, "sampling/sampling_logp_difference/max": 3.893559217453003, "sampling/sampling_logp_difference/mean": 0.3726516366004944, "step": 237, "step_time": 20.50610365101602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.993059664964676, "epoch": 0.00476, "grad_norm": 0.19752249121665955, "kl": 0.6344181410968304, "learning_rate": 7.999939549994574e-06, "loss": -0.0717, "step": 238, "step_time": 10.686005841969745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1239258646965027, "epoch": 0.00478, "frac_reward_zero_std": 0.125, "grad_norm": 0.11218086630105972, "kl": 1.0945777893066406, "learning_rate": 7.99993895000023e-06, "loss": -0.0627, "num_tokens": 5196755.0, "reward": 0.6048902273178101, "reward_std": 0.8147047758102417, "rewards/rollout_reward_func/mean": 0.6048902273178101, "rewards/rollout_reward_func/std": 0.8147047758102417, "sampling/importance_sampling_ratio/max": 1.6144635677337646, "sampling/importance_sampling_ratio/mean": 0.8736696243286133, "sampling/importance_sampling_ratio/min": 0.004165679216384888, "sampling/sampling_logp_difference/max": 2.4899673461914062, "sampling/sampling_logp_difference/mean": 0.24134710431098938, "step": 239, "step_time": 20.59384802400018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1352317035198212, "epoch": 0.0048, "grad_norm": 0.10554298013448715, "kl": 0.9576783776283264, "learning_rate": 7.999938347042993e-06, "loss": -0.0629, "step": 240, "step_time": 10.817528838990256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 5.26086950302124, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1883776485919952, "epoch": 0.00482, "frac_reward_zero_std": 0.125, "grad_norm": 0.1460561752319336, "kl": 0.24759700521826744, "learning_rate": 7.999937741122862e-06, "loss": -0.0846, "num_tokens": 5242887.0, "reward": 0.5271003246307373, "reward_std": 1.0033009052276611, "rewards/rollout_reward_func/mean": 0.5271003246307373, "rewards/rollout_reward_func/std": 1.0033009052276611, "sampling/importance_sampling_ratio/max": 2.388491153717041, "sampling/importance_sampling_ratio/mean": 0.7808799147605896, "sampling/importance_sampling_ratio/min": 2.946344279664004e-10, "sampling/sampling_logp_difference/max": 2.7299699783325195, "sampling/sampling_logp_difference/mean": 0.3858220875263214, "step": 241, "step_time": 20.010562342009507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.187165230512619, "epoch": 0.00484, "grad_norm": 0.14067433774471283, "kl": 0.24876626208424568, "learning_rate": 7.999937132239836e-06, "loss": -0.0847, "step": 242, "step_time": 9.987134318012977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.059622049331665, "epoch": 0.00486, "frac_reward_zero_std": 0.25, "grad_norm": 0.15189388394355774, "kl": 0.31275397539138794, "learning_rate": 7.999936520393918e-06, "loss": -0.0687, "num_tokens": 5296811.0, "reward": 0.6613209843635559, "reward_std": 0.7357250452041626, "rewards/rollout_reward_func/mean": 0.6613209843635559, "rewards/rollout_reward_func/std": 0.7357249855995178, "sampling/importance_sampling_ratio/max": 1.6375248432159424, "sampling/importance_sampling_ratio/mean": 0.919368326663971, "sampling/importance_sampling_ratio/min": 6.304875569185242e-05, "sampling/sampling_logp_difference/max": 2.772951126098633, "sampling/sampling_logp_difference/mean": 0.2704923152923584, "step": 243, "step_time": 19.338929796998855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0564973279833794, "epoch": 0.00488, "grad_norm": 0.159287229180336, "kl": 0.3210660144686699, "learning_rate": 7.999935905585108e-06, "loss": -0.0695, "step": 244, "step_time": 10.413000942993676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.799999713897705, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.554336566478014, "epoch": 0.0049, "frac_reward_zero_std": 0.125, "grad_norm": 0.12169525027275085, "kl": 0.490428414195776, "learning_rate": 7.999935287813407e-06, "loss": -0.0848, "num_tokens": 5346795.0, "reward": 0.6931613683700562, "reward_std": 0.870171070098877, "rewards/rollout_reward_func/mean": 0.6931613683700562, "rewards/rollout_reward_func/std": 0.870171070098877, "sampling/importance_sampling_ratio/max": 2.548694372177124, "sampling/importance_sampling_ratio/mean": 0.9512590169906616, "sampling/importance_sampling_ratio/min": 1.8375289073446766e-05, "sampling/sampling_logp_difference/max": 2.4262290000915527, "sampling/sampling_logp_difference/mean": 0.3379410207271576, "step": 245, "step_time": 19.415126203995897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5491874143481255, "epoch": 0.00492, "grad_norm": 0.11604880541563034, "kl": 0.4837502986192703, "learning_rate": 7.999934667078813e-06, "loss": -0.0851, "step": 246, "step_time": 10.301584236993222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 4.6086955070495605, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.149538516998291, "epoch": 0.00494, "frac_reward_zero_std": 0.125, "grad_norm": 0.12034271657466888, "kl": 0.5594150945544243, "learning_rate": 7.999934043381328e-06, "loss": -0.0628, "num_tokens": 5399390.0, "reward": 0.45852380990982056, "reward_std": 0.7865856885910034, "rewards/rollout_reward_func/mean": 0.45852380990982056, "rewards/rollout_reward_func/std": 0.7865857481956482, "sampling/importance_sampling_ratio/max": 1.8297761678695679, "sampling/importance_sampling_ratio/mean": 0.7088649272918701, "sampling/importance_sampling_ratio/min": 7.800574769456503e-11, "sampling/sampling_logp_difference/max": 3.0988755226135254, "sampling/sampling_logp_difference/mean": 0.4473581314086914, "step": 247, "step_time": 20.949673597991932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.158322125673294, "epoch": 0.00496, "grad_norm": 0.12351498007774353, "kl": 0.5939491912722588, "learning_rate": 7.999933416720957e-06, "loss": -0.0629, "step": 248, "step_time": 10.912639581045369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6911098957061768, "epoch": 0.00498, "frac_reward_zero_std": 0.0, "grad_norm": 0.11086155474185944, "kl": 0.912641454488039, "learning_rate": 7.999932787097692e-06, "loss": -0.0662, "num_tokens": 5449218.0, "reward": 0.48129305243492126, "reward_std": 0.7560715079307556, "rewards/rollout_reward_func/mean": 0.48129305243492126, "rewards/rollout_reward_func/std": 0.7560714483261108, "sampling/importance_sampling_ratio/max": 1.5477876663208008, "sampling/importance_sampling_ratio/mean": 0.7166001796722412, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.6632065773010254, "sampling/sampling_logp_difference/mean": 0.3646930754184723, "step": 249, "step_time": 18.83555690399953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.6862019002437592, "epoch": 0.005, "grad_norm": 0.1103634312748909, "kl": 0.8947274461388588, "learning_rate": 7.999932154511542e-06, "loss": -0.0666, "step": 250, "step_time": 10.059021371009294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.035714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7122826278209686, "epoch": 0.00502, "frac_reward_zero_std": 0.0, "grad_norm": 0.32696419954299927, "kl": 0.681702122092247, "learning_rate": 7.999931518962502e-06, "loss": -0.0768, "num_tokens": 5493992.0, "reward": 0.6122889518737793, "reward_std": 0.9014061093330383, "rewards/rollout_reward_func/mean": 0.6122889518737793, "rewards/rollout_reward_func/std": 0.9014061093330383, "sampling/importance_sampling_ratio/max": 1.4876042604446411, "sampling/importance_sampling_ratio/mean": 0.8086925745010376, "sampling/importance_sampling_ratio/min": 0.0002516529930289835, "sampling/sampling_logp_difference/max": 2.056582450866699, "sampling/sampling_logp_difference/mean": 0.3114842474460602, "step": 251, "step_time": 18.616951744974358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01666666753590107, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01666666753590107, "entropy": 1.7040101289749146, "epoch": 0.00504, "grad_norm": 0.09491323679685593, "kl": 0.6478704437613487, "learning_rate": 7.999930880450575e-06, "loss": -0.0786, "step": 252, "step_time": 10.098614229005761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.678571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3318629711866379, "epoch": 0.00506, "frac_reward_zero_std": 0.0, "grad_norm": 0.103968545794487, "kl": 0.651387132704258, "learning_rate": 7.99993023897576e-06, "loss": -0.085, "num_tokens": 5548150.0, "reward": 0.7446229457855225, "reward_std": 0.7761821150779724, "rewards/rollout_reward_func/mean": 0.7446229457855225, "rewards/rollout_reward_func/std": 0.7761821150779724, "sampling/importance_sampling_ratio/max": 1.5751227140426636, "sampling/importance_sampling_ratio/mean": 0.8851406574249268, "sampling/importance_sampling_ratio/min": 1.1354047728673322e-06, "sampling/sampling_logp_difference/max": 2.360410451889038, "sampling/sampling_logp_difference/mean": 0.30614328384399414, "step": 253, "step_time": 19.589831507997587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3337761610746384, "epoch": 0.00508, "grad_norm": 0.0981573536992073, "kl": 0.6283219307661057, "learning_rate": 7.99992959453806e-06, "loss": -0.0851, "step": 254, "step_time": 10.479035235010087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.034482955932617, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4255894869565964, "epoch": 0.0051, "frac_reward_zero_std": 0.125, "grad_norm": 0.10664034634828568, "kl": 0.6939959302544594, "learning_rate": 7.999928947137475e-06, "loss": -0.0703, "num_tokens": 5604447.0, "reward": 0.4838522672653198, "reward_std": 0.7638020515441895, "rewards/rollout_reward_func/mean": 0.4838522672653198, "rewards/rollout_reward_func/std": 0.7638020515441895, "sampling/importance_sampling_ratio/max": 1.7803955078125, "sampling/importance_sampling_ratio/mean": 0.9051939249038696, "sampling/importance_sampling_ratio/min": 7.284365892701317e-07, "sampling/sampling_logp_difference/max": 2.854912281036377, "sampling/sampling_logp_difference/mean": 0.32347381114959717, "step": 255, "step_time": 20.331985655007884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0059523810632526875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 1.4275075197219849, "epoch": 0.00512, "grad_norm": 0.09180548042058945, "kl": 0.6978048086166382, "learning_rate": 7.999928296774006e-06, "loss": -0.0706, "step": 256, "step_time": 10.74746069201501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9553677439689636, "epoch": 0.00514, "frac_reward_zero_std": 0.0, "grad_norm": 0.14517028629779816, "kl": 0.7832383140921593, "learning_rate": 7.999927643447652e-06, "loss": -0.096, "num_tokens": 5650653.0, "reward": 0.2704532742500305, "reward_std": 0.8748576045036316, "rewards/rollout_reward_func/mean": 0.2704532742500305, "rewards/rollout_reward_func/std": 0.8748575448989868, "sampling/importance_sampling_ratio/max": 1.902116060256958, "sampling/importance_sampling_ratio/mean": 0.7299733757972717, "sampling/importance_sampling_ratio/min": 4.2777578528330196e-06, "sampling/sampling_logp_difference/max": 3.130918025970459, "sampling/sampling_logp_difference/mean": 0.3192397356033325, "step": 257, "step_time": 19.2431629000057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.947787582874298, "epoch": 0.00516, "grad_norm": 0.14433977007865906, "kl": 0.7402839586138725, "learning_rate": 7.999926987158413e-06, "loss": -0.0961, "step": 258, "step_time": 10.161858364968793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.407407283782959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0797281973063946, "epoch": 0.00518, "frac_reward_zero_std": 0.25, "grad_norm": 0.07991296797990799, "kl": 0.428208164870739, "learning_rate": 7.999926327906292e-06, "loss": -0.09, "num_tokens": 5695381.0, "reward": 0.5930756330490112, "reward_std": 0.8962921500205994, "rewards/rollout_reward_func/mean": 0.5930756330490112, "rewards/rollout_reward_func/std": 0.8962920904159546, "sampling/importance_sampling_ratio/max": 2.8308000564575195, "sampling/importance_sampling_ratio/mean": 1.0047062635421753, "sampling/importance_sampling_ratio/min": 2.6294213967048563e-05, "sampling/sampling_logp_difference/max": 2.0498318672180176, "sampling/sampling_logp_difference/mean": 0.21342867612838745, "step": 259, "step_time": 18.29826575500192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0739120095968246, "epoch": 0.0052, "grad_norm": 0.0801253616809845, "kl": 0.4228803999722004, "learning_rate": 7.999925665691289e-06, "loss": -0.0902, "step": 260, "step_time": 9.754891024000244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.838709354400635, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.243543416261673, "epoch": 0.00522, "frac_reward_zero_std": 0.25, "grad_norm": 0.16970194876194, "kl": 0.7607885748147964, "learning_rate": 7.999925000513405e-06, "loss": -0.0523, "num_tokens": 5746694.0, "reward": 0.721305251121521, "reward_std": 0.707994818687439, "rewards/rollout_reward_func/mean": 0.721305251121521, "rewards/rollout_reward_func/std": 0.707994818687439, "sampling/importance_sampling_ratio/max": 1.9889949560165405, "sampling/importance_sampling_ratio/mean": 0.8384492993354797, "sampling/importance_sampling_ratio/min": 7.390376413241029e-05, "sampling/sampling_logp_difference/max": 2.3554959297180176, "sampling/sampling_logp_difference/mean": 0.28235578536987305, "step": 261, "step_time": 19.579455836006673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2385703325271606, "epoch": 0.00524, "grad_norm": 0.14338445663452148, "kl": 0.787886917591095, "learning_rate": 7.999924332372639e-06, "loss": -0.053, "step": 262, "step_time": 10.61086151903146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.208333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9665509760379791, "epoch": 0.00526, "frac_reward_zero_std": 0.0, "grad_norm": 0.1131443902850151, "kl": 0.36667894572019577, "learning_rate": 7.999923661268994e-06, "loss": -0.0912, "num_tokens": 5804106.0, "reward": 0.5122036933898926, "reward_std": 0.7502248883247375, "rewards/rollout_reward_func/mean": 0.5122036933898926, "rewards/rollout_reward_func/std": 0.7502248883247375, "sampling/importance_sampling_ratio/max": 1.6349292993545532, "sampling/importance_sampling_ratio/mean": 0.8756980299949646, "sampling/importance_sampling_ratio/min": 6.6694096858554985e-06, "sampling/sampling_logp_difference/max": 1.8561664819717407, "sampling/sampling_logp_difference/mean": 0.3564293384552002, "step": 263, "step_time": 19.918457618012326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9587583541870117, "epoch": 0.00528, "grad_norm": 0.11127851158380508, "kl": 0.3979309946298599, "learning_rate": 7.999922987202466e-06, "loss": -0.0912, "step": 264, "step_time": 10.403921502031153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.71999979019165, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7972119897603989, "epoch": 0.0053, "frac_reward_zero_std": 0.125, "grad_norm": 0.08697140216827393, "kl": 0.7979226112365723, "learning_rate": 7.999922310173063e-06, "loss": -0.0868, "num_tokens": 5854497.0, "reward": 0.5051333904266357, "reward_std": 0.886615514755249, "rewards/rollout_reward_func/mean": 0.5051333904266357, "rewards/rollout_reward_func/std": 0.886615514755249, "sampling/importance_sampling_ratio/max": 1.7637633085250854, "sampling/importance_sampling_ratio/mean": 0.754416286945343, "sampling/importance_sampling_ratio/min": 5.692526201528381e-07, "sampling/sampling_logp_difference/max": 2.414681911468506, "sampling/sampling_logp_difference/mean": 0.3456866443157196, "step": 265, "step_time": 19.942066546995193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7891065329313278, "epoch": 0.00532, "grad_norm": 0.08527904748916626, "kl": 0.8184904977679253, "learning_rate": 7.99992163018078e-06, "loss": -0.087, "step": 266, "step_time": 10.25487259196234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5143216624855995, "epoch": 0.00534, "frac_reward_zero_std": 0.0, "grad_norm": 0.09239692240953445, "kl": 0.4889365918934345, "learning_rate": 7.99992094722562e-06, "loss": -0.0844, "num_tokens": 5906138.0, "reward": 0.4200291037559509, "reward_std": 0.9061587452888489, "rewards/rollout_reward_func/mean": 0.4200291037559509, "rewards/rollout_reward_func/std": 0.9061588048934937, "sampling/importance_sampling_ratio/max": 1.7964181900024414, "sampling/importance_sampling_ratio/mean": 0.7973607778549194, "sampling/importance_sampling_ratio/min": 1.0372003089287318e-05, "sampling/sampling_logp_difference/max": 2.474907398223877, "sampling/sampling_logp_difference/mean": 0.3249526619911194, "step": 267, "step_time": 21.17846169898985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.509746141731739, "epoch": 0.00536, "grad_norm": 0.08422065526247025, "kl": 0.4647878631949425, "learning_rate": 7.999920261307583e-06, "loss": -0.0845, "step": 268, "step_time": 11.440782975987531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9848345220088959, "epoch": 0.00538, "frac_reward_zero_std": 0.0, "grad_norm": 0.10450567305088043, "kl": 0.4757564440369606, "learning_rate": 7.999919572426668e-06, "loss": -0.08, "num_tokens": 5957004.0, "reward": 0.5344504714012146, "reward_std": 0.8763284683227539, "rewards/rollout_reward_func/mean": 0.5344504714012146, "rewards/rollout_reward_func/std": 0.8763284683227539, "sampling/importance_sampling_ratio/max": 1.459186315536499, "sampling/importance_sampling_ratio/mean": 0.8280544281005859, "sampling/importance_sampling_ratio/min": 1.1582258069298135e-10, "sampling/sampling_logp_difference/max": 2.4627671241760254, "sampling/sampling_logp_difference/mean": 0.4366001486778259, "step": 269, "step_time": 19.095572716963943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9844918549060822, "epoch": 0.0054, "grad_norm": 0.10617558658123016, "kl": 0.43579693883657455, "learning_rate": 7.999918880582879e-06, "loss": -0.0802, "step": 270, "step_time": 9.959784593986114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.694269523024559, "epoch": 0.00542, "frac_reward_zero_std": 0.0, "grad_norm": 0.16387590765953064, "kl": 0.6437830999493599, "learning_rate": 7.999918185776215e-06, "loss": -0.0752, "num_tokens": 6010600.0, "reward": 0.569720447063446, "reward_std": 0.8728220462799072, "rewards/rollout_reward_func/mean": 0.569720447063446, "rewards/rollout_reward_func/std": 0.8728220462799072, "sampling/importance_sampling_ratio/max": 2.36026668548584, "sampling/importance_sampling_ratio/mean": 0.8980359435081482, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3232154846191406, "sampling/sampling_logp_difference/mean": 0.34887683391571045, "step": 271, "step_time": 19.89554290700471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6978454738855362, "epoch": 0.00544, "grad_norm": 0.16330693662166595, "kl": 0.6279420182108879, "learning_rate": 7.999917488006676e-06, "loss": -0.0754, "step": 272, "step_time": 10.470838478970109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9261908531188965, "epoch": 0.00546, "frac_reward_zero_std": 0.0, "grad_norm": 0.07824838906526566, "kl": 0.5205772593617439, "learning_rate": 7.999916787274264e-06, "loss": -0.0825, "num_tokens": 6059840.0, "reward": 0.5282260775566101, "reward_std": 0.8667983412742615, "rewards/rollout_reward_func/mean": 0.5282260775566101, "rewards/rollout_reward_func/std": 0.8667983412742615, "sampling/importance_sampling_ratio/max": 2.2113633155822754, "sampling/importance_sampling_ratio/mean": 0.8311017751693726, "sampling/importance_sampling_ratio/min": 1.9722568822544417e-07, "sampling/sampling_logp_difference/max": 2.319361686706543, "sampling/sampling_logp_difference/mean": 0.3701004385948181, "step": 273, "step_time": 19.27680981697631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9277043342590332, "epoch": 0.00548, "grad_norm": 0.08258850872516632, "kl": 0.5296620354056358, "learning_rate": 7.99991608357898e-06, "loss": -0.0824, "step": 274, "step_time": 10.288766915007727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 5.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.651670455932617, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 0.12131093442440033, "kl": 0.5302865207195282, "learning_rate": 7.999915376920822e-06, "loss": -0.0573, "num_tokens": 6115920.0, "reward": 0.5838415622711182, "reward_std": 0.846586287021637, "rewards/rollout_reward_func/mean": 0.5838415622711182, "rewards/rollout_reward_func/std": 0.846586287021637, "sampling/importance_sampling_ratio/max": 1.7145540714263916, "sampling/importance_sampling_ratio/mean": 0.664065957069397, "sampling/importance_sampling_ratio/min": 8.211888768983044e-08, "sampling/sampling_logp_difference/max": 2.796105146408081, "sampling/sampling_logp_difference/mean": 0.47569888830184937, "step": 275, "step_time": 20.37544083103421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.660217583179474, "epoch": 0.00552, "grad_norm": 0.12161408364772797, "kl": 0.5265777707099915, "learning_rate": 7.999914667299794e-06, "loss": -0.0575, "step": 276, "step_time": 10.829526775021804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.230769634246826, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0185680091381073, "epoch": 0.00554, "frac_reward_zero_std": 0.25, "grad_norm": 0.12265998870134354, "kl": 0.5694177635014057, "learning_rate": 7.999913954715895e-06, "loss": -0.0495, "num_tokens": 6165704.0, "reward": 0.27002155780792236, "reward_std": 0.7098234295845032, "rewards/rollout_reward_func/mean": 0.27002155780792236, "rewards/rollout_reward_func/std": 0.7098234295845032, "sampling/importance_sampling_ratio/max": 1.7337957620620728, "sampling/importance_sampling_ratio/mean": 0.7076838612556458, "sampling/importance_sampling_ratio/min": 4.944054499134154e-09, "sampling/sampling_logp_difference/max": 2.717682123184204, "sampling/sampling_logp_difference/mean": 0.42215806245803833, "step": 277, "step_time": 20.17431084101554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0193710327148438, "epoch": 0.00556, "grad_norm": 0.12367068976163864, "kl": 0.5304573178291321, "learning_rate": 7.999913239169126e-06, "loss": -0.05, "step": 278, "step_time": 10.955055808008183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.440000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.013393461704254, "epoch": 0.00558, "frac_reward_zero_std": 0.125, "grad_norm": 0.10810369998216629, "kl": 0.6009195521473885, "learning_rate": 7.999912520659488e-06, "loss": -0.0491, "num_tokens": 6218588.0, "reward": 0.6019015908241272, "reward_std": 0.7366352081298828, "rewards/rollout_reward_func/mean": 0.6019015908241272, "rewards/rollout_reward_func/std": 0.7366352081298828, "sampling/importance_sampling_ratio/max": 1.831655502319336, "sampling/importance_sampling_ratio/mean": 0.6712236404418945, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.8864004611968994, "sampling/sampling_logp_difference/mean": 0.3856047987937927, "step": 279, "step_time": 20.15274159196997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0143361389636993, "epoch": 0.0056, "grad_norm": 0.09752732515335083, "kl": 0.5253709703683853, "learning_rate": 7.99991179918698e-06, "loss": -0.0494, "step": 280, "step_time": 10.35197304704343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3035928085446358, "epoch": 0.00562, "frac_reward_zero_std": 0.25, "grad_norm": 0.11928323656320572, "kl": 0.6923801302909851, "learning_rate": 7.999911074751606e-06, "loss": -0.0655, "num_tokens": 6267919.0, "reward": 0.37428221106529236, "reward_std": 0.8213624358177185, "rewards/rollout_reward_func/mean": 0.37428221106529236, "rewards/rollout_reward_func/std": 0.8213623762130737, "sampling/importance_sampling_ratio/max": 1.3713129758834839, "sampling/importance_sampling_ratio/mean": 0.8370892405509949, "sampling/importance_sampling_ratio/min": 9.440030407859012e-05, "sampling/sampling_logp_difference/max": 2.021775245666504, "sampling/sampling_logp_difference/mean": 0.24083802103996277, "step": 281, "step_time": 19.05344401698676 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 1.3077930957078934, "epoch": 0.00564, "grad_norm": 0.1132296621799469, "kl": 0.6565688997507095, "learning_rate": 7.999910347353363e-06, "loss": -0.066, "step": 282, "step_time": 10.129132980015129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 5.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7393870949745178, "epoch": 0.00566, "frac_reward_zero_std": 0.125, "grad_norm": 0.33585888147354126, "kl": 0.49967265874147415, "learning_rate": 7.999909616992255e-06, "loss": -0.0408, "num_tokens": 6322705.0, "reward": 0.5822451114654541, "reward_std": 0.6233378052711487, "rewards/rollout_reward_func/mean": 0.5822451114654541, "rewards/rollout_reward_func/std": 0.6233378052711487, "sampling/importance_sampling_ratio/max": 1.767797827720642, "sampling/importance_sampling_ratio/mean": 0.8576176166534424, "sampling/importance_sampling_ratio/min": 3.282034413132351e-07, "sampling/sampling_logp_difference/max": 2.736767292022705, "sampling/sampling_logp_difference/mean": 0.39117011427879333, "step": 283, "step_time": 19.98805973600247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.7401084303855896, "epoch": 0.00568, "grad_norm": 0.2697790563106537, "kl": 0.46328238770365715, "learning_rate": 7.99990888366828e-06, "loss": -0.0419, "step": 284, "step_time": 10.386812303971965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.192307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3586751818656921, "epoch": 0.0057, "frac_reward_zero_std": 0.375, "grad_norm": 0.15063545107841492, "kl": 0.4010852947831154, "learning_rate": 7.99990814738144e-06, "loss": -0.0545, "num_tokens": 6373892.0, "reward": 0.7937209606170654, "reward_std": 0.8377634286880493, "rewards/rollout_reward_func/mean": 0.7937209606170654, "rewards/rollout_reward_func/std": 0.8377634286880493, "sampling/importance_sampling_ratio/max": 1.6491740942001343, "sampling/importance_sampling_ratio/mean": 0.9073297381401062, "sampling/importance_sampling_ratio/min": 2.5944873414118774e-06, "sampling/sampling_logp_difference/max": 1.8357574939727783, "sampling/sampling_logp_difference/mean": 0.27837657928466797, "step": 285, "step_time": 19.391275822999887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.3567507863044739, "epoch": 0.00572, "grad_norm": 0.1478983461856842, "kl": 0.4095845893025398, "learning_rate": 7.999907408131737e-06, "loss": -0.0545, "step": 286, "step_time": 10.785504935018253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6172389090061188, "epoch": 0.00574, "frac_reward_zero_std": 0.125, "grad_norm": 0.07996816188097, "kl": 0.5492328591644764, "learning_rate": 7.999906665919169e-06, "loss": -0.0477, "num_tokens": 6416956.0, "reward": 0.32173049449920654, "reward_std": 0.9692912101745605, "rewards/rollout_reward_func/mean": 0.32173049449920654, "rewards/rollout_reward_func/std": 0.969291090965271, "sampling/importance_sampling_ratio/max": 2.048434257507324, "sampling/importance_sampling_ratio/mean": 0.8263925909996033, "sampling/importance_sampling_ratio/min": 8.55891073570092e-07, "sampling/sampling_logp_difference/max": 1.9007534980773926, "sampling/sampling_logp_difference/mean": 0.31872397661209106, "step": 287, "step_time": 18.494957137969323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.621228352189064, "epoch": 0.00576, "grad_norm": 0.08006361126899719, "kl": 0.5515300147235394, "learning_rate": 7.99990592074374e-06, "loss": -0.0479, "step": 288, "step_time": 9.944857419031905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.148148059844971, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9846222475171089, "epoch": 0.00578, "frac_reward_zero_std": 0.125, "grad_norm": 0.06500504165887833, "kl": 0.3327351063489914, "learning_rate": 7.999905172605446e-06, "loss": -0.0536, "num_tokens": 6461682.0, "reward": 0.8517476320266724, "reward_std": 0.888813853263855, "rewards/rollout_reward_func/mean": 0.8517476320266724, "rewards/rollout_reward_func/std": 0.8888139128684998, "sampling/importance_sampling_ratio/max": 1.8274874687194824, "sampling/importance_sampling_ratio/mean": 1.000128149986267, "sampling/importance_sampling_ratio/min": 0.0013895452721044421, "sampling/sampling_logp_difference/max": 1.622391939163208, "sampling/sampling_logp_difference/mean": 0.18457946181297302, "step": 289, "step_time": 18.713624304014957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9841331392526627, "epoch": 0.0058, "grad_norm": 0.06372790783643723, "kl": 0.3235144875943661, "learning_rate": 7.999904421504293e-06, "loss": -0.0537, "step": 290, "step_time": 10.026365020981757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.429680660367012, "epoch": 0.00582, "frac_reward_zero_std": 0.125, "grad_norm": 0.0841754749417305, "kl": 0.3977566808462143, "learning_rate": 7.999903667440278e-06, "loss": -0.0664, "num_tokens": 6508514.0, "reward": 0.4262467920780182, "reward_std": 0.8819931745529175, "rewards/rollout_reward_func/mean": 0.4262467920780182, "rewards/rollout_reward_func/std": 0.8819932341575623, "sampling/importance_sampling_ratio/max": 1.3943120241165161, "sampling/importance_sampling_ratio/mean": 0.869870662689209, "sampling/importance_sampling_ratio/min": 2.4207210458371264e-08, "sampling/sampling_logp_difference/max": 2.0801596641540527, "sampling/sampling_logp_difference/mean": 0.2724110782146454, "step": 291, "step_time": 19.334683510969626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4310917109251022, "epoch": 0.00584, "grad_norm": 0.08603332191705704, "kl": 0.3891299143433571, "learning_rate": 7.999902910413404e-06, "loss": -0.0667, "step": 292, "step_time": 10.142046911001671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2859373465180397, "epoch": 0.00586, "frac_reward_zero_std": 0.25, "grad_norm": 0.12689325213432312, "kl": 0.9296894371509552, "learning_rate": 7.999902150423671e-06, "loss": -0.0628, "num_tokens": 6556444.0, "reward": 0.6893253922462463, "reward_std": 0.7919260859489441, "rewards/rollout_reward_func/mean": 0.6893253922462463, "rewards/rollout_reward_func/std": 0.7919260859489441, "sampling/importance_sampling_ratio/max": 1.7948040962219238, "sampling/importance_sampling_ratio/mean": 0.8522982597351074, "sampling/importance_sampling_ratio/min": 2.4697501430637203e-05, "sampling/sampling_logp_difference/max": 1.866982340812683, "sampling/sampling_logp_difference/mean": 0.300201952457428, "step": 293, "step_time": 19.412768878944917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2830346710979939, "epoch": 0.00588, "grad_norm": 0.12686339020729065, "kl": 0.8670111671090126, "learning_rate": 7.999901387471079e-06, "loss": -0.0635, "step": 294, "step_time": 10.611721016990487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2055876404047012, "epoch": 0.0059, "frac_reward_zero_std": 0.25, "grad_norm": 0.08178962022066116, "kl": 0.5775078162550926, "learning_rate": 7.99990062155563e-06, "loss": -0.0279, "num_tokens": 6605905.0, "reward": 0.7406965494155884, "reward_std": 0.7666766047477722, "rewards/rollout_reward_func/mean": 0.7406965494155884, "rewards/rollout_reward_func/std": 0.7666766047477722, "sampling/importance_sampling_ratio/max": 1.8312915563583374, "sampling/importance_sampling_ratio/mean": 0.8288171291351318, "sampling/importance_sampling_ratio/min": 2.9767614250886254e-05, "sampling/sampling_logp_difference/max": 1.8071346282958984, "sampling/sampling_logp_difference/mean": 0.302636057138443, "step": 295, "step_time": 18.711901286034845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2075300514698029, "epoch": 0.00592, "grad_norm": 0.08346956968307495, "kl": 0.5473222658038139, "learning_rate": 7.999899852677322e-06, "loss": -0.0279, "step": 296, "step_time": 10.583130409009755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.92307710647583, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0460804998874664, "epoch": 0.00594, "frac_reward_zero_std": 0.0, "grad_norm": 0.14142119884490967, "kl": 0.3593020588159561, "learning_rate": 7.99989908083616e-06, "loss": -0.0678, "num_tokens": 6663524.0, "reward": 0.3855300545692444, "reward_std": 0.7669370174407959, "rewards/rollout_reward_func/mean": 0.3855300545692444, "rewards/rollout_reward_func/std": 0.7669370174407959, "sampling/importance_sampling_ratio/max": 1.4099926948547363, "sampling/importance_sampling_ratio/mean": 0.6105189323425293, "sampling/importance_sampling_ratio/min": 1.7898045143738273e-06, "sampling/sampling_logp_difference/max": 2.31308650970459, "sampling/sampling_logp_difference/mean": 0.36062660813331604, "step": 297, "step_time": 21.643145267968066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0444688498973846, "epoch": 0.00596, "grad_norm": 0.139652818441391, "kl": 0.36858946830034256, "learning_rate": 7.999898306032144e-06, "loss": -0.0681, "step": 298, "step_time": 10.833400790957967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.555555820465088, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6824069917201996, "epoch": 0.00598, "frac_reward_zero_std": 0.125, "grad_norm": 0.12855975329875946, "kl": 0.3188205100595951, "learning_rate": 7.999897528265272e-06, "loss": -0.083, "num_tokens": 6714502.0, "reward": 0.6459416151046753, "reward_std": 0.8599050045013428, "rewards/rollout_reward_func/mean": 0.6459416151046753, "rewards/rollout_reward_func/std": 0.8599048852920532, "sampling/importance_sampling_ratio/max": 1.9543044567108154, "sampling/importance_sampling_ratio/mean": 0.8364782333374023, "sampling/importance_sampling_ratio/min": 9.229766874341294e-06, "sampling/sampling_logp_difference/max": 1.6775298118591309, "sampling/sampling_logp_difference/mean": 0.3150531053543091, "step": 299, "step_time": 19.993144074018346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.678859829902649, "epoch": 0.006, "grad_norm": 0.13647092878818512, "kl": 0.30484165623784065, "learning_rate": 7.999896747535546e-06, "loss": -0.0831, "step": 300, "step_time": 10.421236378955655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 4.066667079925537, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7892234995961189, "epoch": 0.00602, "frac_reward_zero_std": 0.25, "grad_norm": 0.22182412445545197, "kl": 1.0346800237894058, "learning_rate": 7.999895963842966e-06, "loss": -0.0659, "num_tokens": 6757282.0, "reward": 0.9680888652801514, "reward_std": 0.6736341714859009, "rewards/rollout_reward_func/mean": 0.9680888652801514, "rewards/rollout_reward_func/std": 0.6736341118812561, "sampling/importance_sampling_ratio/max": 2.85237455368042, "sampling/importance_sampling_ratio/mean": 1.1300196647644043, "sampling/importance_sampling_ratio/min": 2.4664368325488795e-08, "sampling/sampling_logp_difference/max": 2.0234031677246094, "sampling/sampling_logp_difference/mean": 0.22374819219112396, "step": 301, "step_time": 17.704747738025617 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.7893822491168976, "epoch": 0.00604, "grad_norm": 0.09706810861825943, "kl": 1.1065162867307663, "learning_rate": 7.999895177187535e-06, "loss": -0.0665, "step": 302, "step_time": 10.103775307012256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.208333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6108124256134033, "epoch": 0.00606, "frac_reward_zero_std": 0.125, "grad_norm": 0.20319846272468567, "kl": 0.4226079061627388, "learning_rate": 7.99989438756925e-06, "loss": -0.0646, "num_tokens": 6807746.0, "reward": 0.3641650378704071, "reward_std": 0.8498501777648926, "rewards/rollout_reward_func/mean": 0.3641650378704071, "rewards/rollout_reward_func/std": 0.8498501181602478, "sampling/importance_sampling_ratio/max": 1.8835744857788086, "sampling/importance_sampling_ratio/mean": 0.7976277470588684, "sampling/importance_sampling_ratio/min": 1.1829862387457979e-06, "sampling/sampling_logp_difference/max": 2.571394920349121, "sampling/sampling_logp_difference/mean": 0.32586348056793213, "step": 303, "step_time": 21.059126978943823 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "entropy": 1.612661600112915, "epoch": 0.00608, "grad_norm": 0.14183658361434937, "kl": 0.4350764974951744, "learning_rate": 7.999893594988118e-06, "loss": -0.0654, "step": 304, "step_time": 10.635492463043192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.827882245182991, "epoch": 0.0061, "frac_reward_zero_std": 0.0, "grad_norm": 0.16309231519699097, "kl": 0.6451245844364166, "learning_rate": 7.999892799444135e-06, "loss": -0.0496, "num_tokens": 6858985.0, "reward": 0.48441407084465027, "reward_std": 0.8022962808609009, "rewards/rollout_reward_func/mean": 0.48441407084465027, "rewards/rollout_reward_func/std": 0.8022962808609009, "sampling/importance_sampling_ratio/max": 1.7515149116516113, "sampling/importance_sampling_ratio/mean": 0.8039590120315552, "sampling/importance_sampling_ratio/min": 2.6552060106155295e-08, "sampling/sampling_logp_difference/max": 2.6736412048339844, "sampling/sampling_logp_difference/mean": 0.3493462800979614, "step": 305, "step_time": 20.33701942296466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.834990680217743, "epoch": 0.00612, "grad_norm": 0.1662326455116272, "kl": 0.640768900513649, "learning_rate": 7.999892000937302e-06, "loss": -0.0504, "step": 306, "step_time": 11.393044572963845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7129669189453125, "epoch": 0.00614, "frac_reward_zero_std": 0.125, "grad_norm": 0.19158321619033813, "kl": 0.29491056129336357, "learning_rate": 7.99989119946762e-06, "loss": -0.05, "num_tokens": 6912377.0, "reward": 0.07519660145044327, "reward_std": 0.6968924403190613, "rewards/rollout_reward_func/mean": 0.07519660145044327, "rewards/rollout_reward_func/std": 0.6968924403190613, "sampling/importance_sampling_ratio/max": 1.7719920873641968, "sampling/importance_sampling_ratio/mean": 0.6902287602424622, "sampling/importance_sampling_ratio/min": 5.266737090892093e-08, "sampling/sampling_logp_difference/max": 2.314685344696045, "sampling/sampling_logp_difference/mean": 0.32926714420318604, "step": 307, "step_time": 20.54742549697403 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629629664123058, "entropy": 1.722551703453064, "epoch": 0.00616, "grad_norm": 0.17938557267189026, "kl": 0.2794007621705532, "learning_rate": 7.999890395035091e-06, "loss": -0.0507, "step": 308, "step_time": 10.86402676999569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7514342069625854, "epoch": 0.00618, "frac_reward_zero_std": 0.125, "grad_norm": 0.10361240804195404, "kl": 0.5858623310923576, "learning_rate": 7.999889587639716e-06, "loss": -0.0554, "num_tokens": 6963068.0, "reward": 0.38754308223724365, "reward_std": 0.8793056607246399, "rewards/rollout_reward_func/mean": 0.38754308223724365, "rewards/rollout_reward_func/std": 0.8793056011199951, "sampling/importance_sampling_ratio/max": 2.100198745727539, "sampling/importance_sampling_ratio/mean": 0.712914228439331, "sampling/importance_sampling_ratio/min": 2.532409559830029e-10, "sampling/sampling_logp_difference/max": 2.218039035797119, "sampling/sampling_logp_difference/mean": 0.3677341938018799, "step": 309, "step_time": 19.56423978402745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7600246369838715, "epoch": 0.0062, "grad_norm": 0.10563324391841888, "kl": 0.574613057076931, "learning_rate": 7.999888777281495e-06, "loss": -0.0551, "step": 310, "step_time": 10.31094749100157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 5.8275861740112305, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6940365731716156, "epoch": 0.00622, "frac_reward_zero_std": 0.125, "grad_norm": 0.13662810623645782, "kl": 0.43830006942152977, "learning_rate": 7.999887963960429e-06, "loss": -0.0612, "num_tokens": 7013805.0, "reward": 0.6194454431533813, "reward_std": 0.862967848777771, "rewards/rollout_reward_func/mean": 0.6194454431533813, "rewards/rollout_reward_func/std": 0.862967848777771, "sampling/importance_sampling_ratio/max": 2.23323917388916, "sampling/importance_sampling_ratio/mean": 0.8283082246780396, "sampling/importance_sampling_ratio/min": 0.000438174232840538, "sampling/sampling_logp_difference/max": 2.0160927772521973, "sampling/sampling_logp_difference/mean": 0.3046584725379944, "step": 311, "step_time": 19.80398103001062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6995736360549927, "epoch": 0.00624, "grad_norm": 0.13523833453655243, "kl": 0.43398138135671616, "learning_rate": 7.999887147676517e-06, "loss": -0.061, "step": 312, "step_time": 10.385881345981034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.928571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0972928404808044, "epoch": 0.00626, "frac_reward_zero_std": 0.0, "grad_norm": 0.14171050488948822, "kl": 0.43944698199629784, "learning_rate": 7.999886328429762e-06, "loss": -0.1126, "num_tokens": 7069485.0, "reward": 0.6736783981323242, "reward_std": 0.7308452129364014, "rewards/rollout_reward_func/mean": 0.6736783981323242, "rewards/rollout_reward_func/std": 0.7308452129364014, "sampling/importance_sampling_ratio/max": 2.2094626426696777, "sampling/importance_sampling_ratio/mean": 0.7683344483375549, "sampling/importance_sampling_ratio/min": 4.989681201550411e-08, "sampling/sampling_logp_difference/max": 2.2281198501586914, "sampling/sampling_logp_difference/mean": 0.4363349676132202, "step": 313, "step_time": 20.056135535007343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0941530019044876, "epoch": 0.00628, "grad_norm": 0.14074839651584625, "kl": 0.4618655666708946, "learning_rate": 7.999885506220164e-06, "loss": -0.1132, "step": 314, "step_time": 10.494689543003915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3798422068357468, "epoch": 0.0063, "frac_reward_zero_std": 0.125, "grad_norm": 0.14875151216983795, "kl": 0.41901033371686935, "learning_rate": 7.999884681047726e-06, "loss": -0.0439, "num_tokens": 7125264.0, "reward": 0.6187124252319336, "reward_std": 0.6882176995277405, "rewards/rollout_reward_func/mean": 0.6187124252319336, "rewards/rollout_reward_func/std": 0.6882176995277405, "sampling/importance_sampling_ratio/max": 2.71128511428833, "sampling/importance_sampling_ratio/mean": 1.0000075101852417, "sampling/importance_sampling_ratio/min": 1.0218305988018983e-06, "sampling/sampling_logp_difference/max": 1.9553256034851074, "sampling/sampling_logp_difference/mean": 0.2788470387458801, "step": 315, "step_time": 21.186744752019877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002659574383869767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002659574383869767, "entropy": 1.3695815205574036, "epoch": 0.00632, "grad_norm": 0.13849863409996033, "kl": 0.4259873628616333, "learning_rate": 7.999883852912445e-06, "loss": -0.0449, "step": 316, "step_time": 11.689436581946211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9392183125019073, "epoch": 0.00634, "frac_reward_zero_std": 0.0, "grad_norm": 0.16610457003116608, "kl": 1.3871766850352287, "learning_rate": 7.999883021814325e-06, "loss": -0.03, "num_tokens": 7176640.0, "reward": 0.47437185049057007, "reward_std": 0.807435929775238, "rewards/rollout_reward_func/mean": 0.47437185049057007, "rewards/rollout_reward_func/std": 0.807435929775238, "sampling/importance_sampling_ratio/max": 2.5742366313934326, "sampling/importance_sampling_ratio/mean": 0.9037485122680664, "sampling/importance_sampling_ratio/min": 9.443802991881967e-05, "sampling/sampling_logp_difference/max": 1.9103825092315674, "sampling/sampling_logp_difference/mean": 0.23763175308704376, "step": 317, "step_time": 19.261472239973955 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9268378168344498, "epoch": 0.00636, "grad_norm": 0.15284433960914612, "kl": 1.3246966004371643, "learning_rate": 7.999882187753364e-06, "loss": -0.0304, "step": 318, "step_time": 10.390502506983466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.821476697921753, "epoch": 0.00638, "frac_reward_zero_std": 0.0, "grad_norm": 0.07837769389152527, "kl": 0.980635404586792, "learning_rate": 7.999881350729566e-06, "loss": -0.1093, "num_tokens": 7227974.0, "reward": 0.3693258464336395, "reward_std": 0.7973827719688416, "rewards/rollout_reward_func/mean": 0.3693258464336395, "rewards/rollout_reward_func/std": 0.7973827123641968, "sampling/importance_sampling_ratio/max": 2.0577569007873535, "sampling/importance_sampling_ratio/mean": 0.7657604217529297, "sampling/importance_sampling_ratio/min": 9.398358429280051e-07, "sampling/sampling_logp_difference/max": 3.1880900859832764, "sampling/sampling_logp_difference/mean": 0.35970139503479004, "step": 319, "step_time": 19.07459952897625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008774630725383759, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008774630725383759, "entropy": 1.815565139055252, "epoch": 0.0064, "grad_norm": 0.07223418354988098, "kl": 1.0028435289859772, "learning_rate": 7.999880510742928e-06, "loss": -0.1096, "step": 320, "step_time": 10.141802183003165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.407407283782959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6628663539886475, "epoch": 0.00642, "frac_reward_zero_std": 0.0, "grad_norm": 0.0722305178642273, "kl": 0.6633777618408203, "learning_rate": 7.999879667793456e-06, "loss": -0.0786, "num_tokens": 7281250.0, "reward": 0.5327632427215576, "reward_std": 0.8501482605934143, "rewards/rollout_reward_func/mean": 0.5327632427215576, "rewards/rollout_reward_func/std": 0.8501482009887695, "sampling/importance_sampling_ratio/max": 1.67691171169281, "sampling/importance_sampling_ratio/mean": 0.7138810157775879, "sampling/importance_sampling_ratio/min": 1.6674630387569778e-05, "sampling/sampling_logp_difference/max": 1.989957332611084, "sampling/sampling_logp_difference/mean": 0.2851901650428772, "step": 321, "step_time": 22.922753209015355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6638502478599548, "epoch": 0.00644, "grad_norm": 0.07229526340961456, "kl": 0.6687800958752632, "learning_rate": 7.999878821881145e-06, "loss": -0.0788, "step": 322, "step_time": 12.35686532096588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 3.965517282485962, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.811308316886425, "epoch": 0.00646, "frac_reward_zero_std": 0.125, "grad_norm": 0.08858391642570496, "kl": 0.5224760621786118, "learning_rate": 7.999877973006e-06, "loss": -0.0508, "num_tokens": 7337600.0, "reward": 0.406252384185791, "reward_std": 0.7972450256347656, "rewards/rollout_reward_func/mean": 0.406252384185791, "rewards/rollout_reward_func/std": 0.7972450852394104, "sampling/importance_sampling_ratio/max": 2.0220792293548584, "sampling/importance_sampling_ratio/mean": 0.95490962266922, "sampling/importance_sampling_ratio/min": 0.0014490928733721375, "sampling/sampling_logp_difference/max": 2.42386531829834, "sampling/sampling_logp_difference/mean": 0.19295379519462585, "step": 323, "step_time": 23.935349838982802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.80704415589571, "epoch": 0.00648, "grad_norm": 0.0832974910736084, "kl": 0.48960766196250916, "learning_rate": 7.99987712116802e-06, "loss": -0.0512, "step": 324, "step_time": 12.87827012798516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.689655303955078, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3969721496105194, "epoch": 0.0065, "frac_reward_zero_std": 0.25, "grad_norm": 0.02916131168603897, "kl": 0.543068453669548, "learning_rate": 7.999876266367207e-06, "loss": -0.077, "num_tokens": 7391408.0, "reward": 0.7330843806266785, "reward_std": 0.8886751532554626, "rewards/rollout_reward_func/mean": 0.7330843806266785, "rewards/rollout_reward_func/std": 0.8886751532554626, "sampling/importance_sampling_ratio/max": 1.5371683835983276, "sampling/importance_sampling_ratio/mean": 0.9176636934280396, "sampling/importance_sampling_ratio/min": 0.00017414696048945189, "sampling/sampling_logp_difference/max": 1.9598803520202637, "sampling/sampling_logp_difference/mean": 0.24176424741744995, "step": 325, "step_time": 25.290279532026034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3916460275650024, "epoch": 0.00652, "grad_norm": 0.02837793156504631, "kl": 0.530295729637146, "learning_rate": 7.99987540860356e-06, "loss": -0.0771, "step": 326, "step_time": 13.525917347986251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9752038419246674, "epoch": 0.00654, "frac_reward_zero_std": 0.25, "grad_norm": 0.06856926530599594, "kl": 0.29392109811306, "learning_rate": 7.999874547877082e-06, "loss": -0.0565, "num_tokens": 7444255.0, "reward": 0.881231963634491, "reward_std": 0.7776545286178589, "rewards/rollout_reward_func/mean": 0.881231963634491, "rewards/rollout_reward_func/std": 0.7776545286178589, "sampling/importance_sampling_ratio/max": 1.4604765176773071, "sampling/importance_sampling_ratio/mean": 0.9684019088745117, "sampling/importance_sampling_ratio/min": 1.044754208123777e-05, "sampling/sampling_logp_difference/max": 1.8780715465545654, "sampling/sampling_logp_difference/mean": 0.16624245047569275, "step": 327, "step_time": 20.19447537799715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9750963449478149, "epoch": 0.00656, "grad_norm": 0.06838110834360123, "kl": 0.30392326042056084, "learning_rate": 7.999873684187772e-06, "loss": -0.0567, "step": 328, "step_time": 10.22537670203019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.0714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.51962311565876, "epoch": 0.00658, "frac_reward_zero_std": 0.125, "grad_norm": 0.11585653573274612, "kl": 0.7692796215415001, "learning_rate": 7.999872817535633e-06, "loss": -0.0735, "num_tokens": 7496295.0, "reward": 0.7514026761054993, "reward_std": 0.6472116708755493, "rewards/rollout_reward_func/mean": 0.7514026761054993, "rewards/rollout_reward_func/std": 0.6472116708755493, "sampling/importance_sampling_ratio/max": 1.5751084089279175, "sampling/importance_sampling_ratio/mean": 0.8720256686210632, "sampling/importance_sampling_ratio/min": 2.9029873616082114e-08, "sampling/sampling_logp_difference/max": 2.105379104614258, "sampling/sampling_logp_difference/mean": 0.3368946611881256, "step": 329, "step_time": 21.914094019972254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5260104089975357, "epoch": 0.0066, "grad_norm": 0.1121574342250824, "kl": 0.7787401080131531, "learning_rate": 7.999871947920665e-06, "loss": -0.0737, "step": 330, "step_time": 12.095752967958106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.297052040696144, "epoch": 0.00662, "frac_reward_zero_std": 0.0, "grad_norm": 0.09187024086713791, "kl": 0.9325205609202385, "learning_rate": 7.999871075342866e-06, "loss": -0.071, "num_tokens": 7544218.0, "reward": 0.7132897973060608, "reward_std": 0.8660687208175659, "rewards/rollout_reward_func/mean": 0.7132897973060608, "rewards/rollout_reward_func/std": 0.8660687208175659, "sampling/importance_sampling_ratio/max": 1.6893556118011475, "sampling/importance_sampling_ratio/mean": 0.7320876717567444, "sampling/importance_sampling_ratio/min": 5.881372999283485e-06, "sampling/sampling_logp_difference/max": 2.0822160243988037, "sampling/sampling_logp_difference/mean": 0.2916436195373535, "step": 331, "step_time": 22.231891136965714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2985321283340454, "epoch": 0.00664, "grad_norm": 0.0877152606844902, "kl": 0.9243316948413849, "learning_rate": 7.999870199802242e-06, "loss": -0.0712, "step": 332, "step_time": 12.11594474100275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1101007722318172, "epoch": 0.00666, "frac_reward_zero_std": 0.125, "grad_norm": 0.061405960470438004, "kl": 0.40830013900995255, "learning_rate": 7.999869321298789e-06, "loss": -0.0638, "num_tokens": 7596977.0, "reward": 0.703101634979248, "reward_std": 0.8244979977607727, "rewards/rollout_reward_func/mean": 0.703101634979248, "rewards/rollout_reward_func/std": 0.8244979977607727, "sampling/importance_sampling_ratio/max": 1.4280189275741577, "sampling/importance_sampling_ratio/mean": 0.891697883605957, "sampling/importance_sampling_ratio/min": 1.9850233456963906e-06, "sampling/sampling_logp_difference/max": 2.6761393547058105, "sampling/sampling_logp_difference/mean": 0.21703433990478516, "step": 333, "step_time": 23.662240714009386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1116054952144623, "epoch": 0.00668, "grad_norm": 0.05798480287194252, "kl": 0.41490086913108826, "learning_rate": 7.999868439832512e-06, "loss": -0.0638, "step": 334, "step_time": 12.540040646039415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9538232237100601, "epoch": 0.0067, "frac_reward_zero_std": 0.125, "grad_norm": 0.1611814945936203, "kl": 1.5211142301559448, "learning_rate": 7.999867555403407e-06, "loss": -0.0406, "num_tokens": 7651625.0, "reward": 0.8150229454040527, "reward_std": 0.7907713055610657, "rewards/rollout_reward_func/mean": 0.8150229454040527, "rewards/rollout_reward_func/std": 0.7907713055610657, "sampling/importance_sampling_ratio/max": 1.7883696556091309, "sampling/importance_sampling_ratio/mean": 0.915188729763031, "sampling/importance_sampling_ratio/min": 2.0317353119025938e-05, "sampling/sampling_logp_difference/max": 2.369068145751953, "sampling/sampling_logp_difference/mean": 0.22742924094200134, "step": 335, "step_time": 23.351020422036527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9553153067827225, "epoch": 0.00672, "grad_norm": 0.14993251860141754, "kl": 1.4251667335629463, "learning_rate": 7.999866668011482e-06, "loss": -0.0408, "step": 336, "step_time": 12.474325802962994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.806451320648193, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.938871406018734, "epoch": 0.00674, "frac_reward_zero_std": 0.125, "grad_norm": 0.06671057641506195, "kl": 0.7608001232147217, "learning_rate": 7.999865777656731e-06, "loss": -0.0332, "num_tokens": 7709515.0, "reward": 0.6800035834312439, "reward_std": 0.7629060745239258, "rewards/rollout_reward_func/mean": 0.6800035834312439, "rewards/rollout_reward_func/std": 0.7629060745239258, "sampling/importance_sampling_ratio/max": 1.474482774734497, "sampling/importance_sampling_ratio/mean": 0.9714881181716919, "sampling/importance_sampling_ratio/min": 1.7841202861745842e-05, "sampling/sampling_logp_difference/max": 2.6569464206695557, "sampling/sampling_logp_difference/mean": 0.23141354322433472, "step": 337, "step_time": 23.713723011984257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9480154290795326, "epoch": 0.00676, "grad_norm": 0.06537723541259766, "kl": 0.756986603140831, "learning_rate": 7.999864884339157e-06, "loss": -0.0331, "step": 338, "step_time": 13.002296422986547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.111111164093018, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.272420421242714, "epoch": 0.00678, "frac_reward_zero_std": 0.125, "grad_norm": 0.04333522543311119, "kl": 0.5719372034072876, "learning_rate": 7.999863988058763e-06, "loss": -0.0633, "num_tokens": 7762112.0, "reward": 0.8861257433891296, "reward_std": 0.7058530449867249, "rewards/rollout_reward_func/mean": 0.8861257433891296, "rewards/rollout_reward_func/std": 0.7058529853820801, "sampling/importance_sampling_ratio/max": 1.2847965955734253, "sampling/importance_sampling_ratio/mean": 0.8505656719207764, "sampling/importance_sampling_ratio/min": 7.447052894349326e-07, "sampling/sampling_logp_difference/max": 2.5825533866882324, "sampling/sampling_logp_difference/mean": 0.25045087933540344, "step": 339, "step_time": 24.201431594992755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.28326865285635, "epoch": 0.0068, "grad_norm": 0.04235270991921425, "kl": 0.5252508968114853, "learning_rate": 7.999863088815548e-06, "loss": -0.0633, "step": 340, "step_time": 13.541057641006773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 4.0625, "completions/mean_terminated_length": 4.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5271020531654358, "epoch": 0.00682, "frac_reward_zero_std": 0.125, "grad_norm": 0.10553260147571564, "kl": 0.4584100991487503, "learning_rate": 7.999862186609512e-06, "loss": -0.0262, "num_tokens": 7820234.0, "reward": 0.5611404180526733, "reward_std": 0.6224525570869446, "rewards/rollout_reward_func/mean": 0.5611404180526733, "rewards/rollout_reward_func/std": 0.6224524974822998, "sampling/importance_sampling_ratio/max": 2.7978179454803467, "sampling/importance_sampling_ratio/mean": 1.1250550746917725, "sampling/importance_sampling_ratio/min": 0.03702221065759659, "sampling/sampling_logp_difference/max": 1.9300564527511597, "sampling/sampling_logp_difference/mean": 0.13006773591041565, "step": 341, "step_time": 24.52369869503309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5421755239367485, "epoch": 0.00684, "grad_norm": 0.1047242283821106, "kl": 0.4427836090326309, "learning_rate": 7.999861281440659e-06, "loss": -0.0262, "step": 342, "step_time": 13.703704695974011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.679999828338623, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0073390007019043, "epoch": 0.00686, "frac_reward_zero_std": 0.125, "grad_norm": 0.09367377310991287, "kl": 0.29430690221488476, "learning_rate": 7.999860373308985e-06, "loss": -0.0645, "num_tokens": 7875905.0, "reward": 0.5877150297164917, "reward_std": 0.7990611791610718, "rewards/rollout_reward_func/mean": 0.5877150297164917, "rewards/rollout_reward_func/std": 0.7990611791610718, "sampling/importance_sampling_ratio/max": 2.31071400642395, "sampling/importance_sampling_ratio/mean": 0.7895113825798035, "sampling/importance_sampling_ratio/min": 1.3546212151993586e-08, "sampling/sampling_logp_difference/max": 2.578260660171509, "sampling/sampling_logp_difference/mean": 0.3753862977027893, "step": 343, "step_time": 24.722007953969296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0126174241304398, "epoch": 0.00688, "grad_norm": 0.08471594005823135, "kl": 0.2978045344352722, "learning_rate": 7.999859462214496e-06, "loss": -0.0646, "step": 344, "step_time": 13.131701047997922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 5.178571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4711092486977577, "epoch": 0.0069, "frac_reward_zero_std": 0.125, "grad_norm": 0.07303572446107864, "kl": 0.5310560911893845, "learning_rate": 7.999858548157192e-06, "loss": -0.0345, "num_tokens": 7932342.0, "reward": 0.4820771813392639, "reward_std": 0.756584644317627, "rewards/rollout_reward_func/mean": 0.4820771813392639, "rewards/rollout_reward_func/std": 0.756584644317627, "sampling/importance_sampling_ratio/max": 1.480770468711853, "sampling/importance_sampling_ratio/mean": 0.7506901025772095, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.730863332748413, "sampling/sampling_logp_difference/mean": 0.25526556372642517, "step": 345, "step_time": 24.61879976899945 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.4769241958856583, "epoch": 0.00692, "grad_norm": 0.06699428707361221, "kl": 0.5083408057689667, "learning_rate": 7.99985763113707e-06, "loss": -0.0347, "step": 346, "step_time": 12.905095941008767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 5.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.515817791223526, "epoch": 0.00694, "frac_reward_zero_std": 0.0, "grad_norm": 0.06783199310302734, "kl": 0.3779159225523472, "learning_rate": 7.999856711154135e-06, "loss": -0.0538, "num_tokens": 7979020.0, "reward": 1.0582793951034546, "reward_std": 0.6348252296447754, "rewards/rollout_reward_func/mean": 1.0582793951034546, "rewards/rollout_reward_func/std": 0.6348252296447754, "sampling/importance_sampling_ratio/max": 1.3703809976577759, "sampling/importance_sampling_ratio/mean": 0.8896775245666504, "sampling/importance_sampling_ratio/min": 1.2987501918360067e-07, "sampling/sampling_logp_difference/max": 2.3146865367889404, "sampling/sampling_logp_difference/mean": 0.26635295152664185, "step": 347, "step_time": 21.437922459008405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5228061378002167, "epoch": 0.00696, "grad_norm": 0.06889066845178604, "kl": 0.3616091161966324, "learning_rate": 7.999855788208386e-06, "loss": -0.0538, "step": 348, "step_time": 11.668684171047062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5588956624269485, "epoch": 0.00698, "frac_reward_zero_std": 0.0, "grad_norm": 0.07355163991451263, "kl": 0.2498638667166233, "learning_rate": 7.999854862299824e-06, "loss": -0.0634, "num_tokens": 8038514.0, "reward": 0.5516186952590942, "reward_std": 0.791168749332428, "rewards/rollout_reward_func/mean": 0.5516186952590942, "rewards/rollout_reward_func/std": 0.791168749332428, "sampling/importance_sampling_ratio/max": 1.5197269916534424, "sampling/importance_sampling_ratio/mean": 0.78505539894104, "sampling/importance_sampling_ratio/min": 3.666276461444795e-05, "sampling/sampling_logp_difference/max": 1.9843379259109497, "sampling/sampling_logp_difference/mean": 0.2665156126022339, "step": 349, "step_time": 24.980856309004594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.559874027967453, "epoch": 0.007, "grad_norm": 0.07040847837924957, "kl": 0.25592082366347313, "learning_rate": 7.999853933428452e-06, "loss": -0.0636, "step": 350, "step_time": 13.15883475804003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 4.730769634246826, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9031165838241577, "epoch": 0.00702, "frac_reward_zero_std": 0.125, "grad_norm": 0.06707148998975754, "kl": 0.31520311906933784, "learning_rate": 7.999853001594268e-06, "loss": -0.0743, "num_tokens": 8087402.0, "reward": 0.6722483038902283, "reward_std": 0.8249081373214722, "rewards/rollout_reward_func/mean": 0.6722483038902283, "rewards/rollout_reward_func/std": 0.8249081373214722, "sampling/importance_sampling_ratio/max": 1.572448968887329, "sampling/importance_sampling_ratio/mean": 0.8697270750999451, "sampling/importance_sampling_ratio/min": 1.41638445416703e-09, "sampling/sampling_logp_difference/max": 2.3268091678619385, "sampling/sampling_logp_difference/mean": 0.3698841333389282, "step": 351, "step_time": 22.102839596947888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.900640293955803, "epoch": 0.00704, "grad_norm": 0.06320390850305557, "kl": 0.32358620315790176, "learning_rate": 7.999852066797274e-06, "loss": -0.0746, "step": 352, "step_time": 11.825915478024399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.867482304573059, "epoch": 0.00706, "frac_reward_zero_std": 0.0, "grad_norm": 0.08855780214071274, "kl": 1.1264195516705513, "learning_rate": 7.999851129037472e-06, "loss": -0.043, "num_tokens": 8141055.0, "reward": 0.3600400984287262, "reward_std": 0.7623745203018188, "rewards/rollout_reward_func/mean": 0.3600400984287262, "rewards/rollout_reward_func/std": 0.7623745203018188, "sampling/importance_sampling_ratio/max": 1.4026068449020386, "sampling/importance_sampling_ratio/mean": 0.633857250213623, "sampling/importance_sampling_ratio/min": 3.984026477610314e-07, "sampling/sampling_logp_difference/max": 2.4995243549346924, "sampling/sampling_logp_difference/mean": 0.32260599732398987, "step": 353, "step_time": 25.095212527987314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.866428554058075, "epoch": 0.00708, "grad_norm": 0.08875814825296402, "kl": 1.1365433558821678, "learning_rate": 7.999850188314861e-06, "loss": -0.043, "step": 354, "step_time": 12.78017233000719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 5.137930870056152, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4534114003181458, "epoch": 0.0071, "frac_reward_zero_std": 0.125, "grad_norm": 0.05585667863488197, "kl": 0.5313124880194664, "learning_rate": 7.999849244629442e-06, "loss": -0.0819, "num_tokens": 8189574.0, "reward": 0.6939566135406494, "reward_std": 0.817089319229126, "rewards/rollout_reward_func/mean": 0.6939566135406494, "rewards/rollout_reward_func/std": 0.8170892596244812, "sampling/importance_sampling_ratio/max": 1.3417613506317139, "sampling/importance_sampling_ratio/mean": 0.8326842784881592, "sampling/importance_sampling_ratio/min": 0.0010544456308707595, "sampling/sampling_logp_difference/max": 1.8658816814422607, "sampling/sampling_logp_difference/mean": 0.2354387789964676, "step": 355, "step_time": 23.89208416006295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4531855285167694, "epoch": 0.00712, "grad_norm": 0.05349764972925186, "kl": 0.5426516458392143, "learning_rate": 7.99984829798122e-06, "loss": -0.0821, "step": 356, "step_time": 12.933608156017726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6197320967912674, "epoch": 0.00714, "frac_reward_zero_std": 0.125, "grad_norm": 0.06978244334459305, "kl": 0.5752751305699348, "learning_rate": 7.99984734837019e-06, "loss": -0.052, "num_tokens": 8243968.0, "reward": 0.6783187389373779, "reward_std": 0.7944909334182739, "rewards/rollout_reward_func/mean": 0.6783187389373779, "rewards/rollout_reward_func/std": 0.7944909334182739, "sampling/importance_sampling_ratio/max": 2.2708845138549805, "sampling/importance_sampling_ratio/mean": 0.8471074104309082, "sampling/importance_sampling_ratio/min": 6.373694350081394e-10, "sampling/sampling_logp_difference/max": 2.237537145614624, "sampling/sampling_logp_difference/mean": 0.33687424659729004, "step": 357, "step_time": 23.313504870980978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6154157519340515, "epoch": 0.00716, "grad_norm": 0.06860467046499252, "kl": 0.5879784785211086, "learning_rate": 7.999846395796358e-06, "loss": -0.0521, "step": 358, "step_time": 12.398309109004913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7045955806970596, "epoch": 0.00718, "frac_reward_zero_std": 0.25, "grad_norm": 0.038150202482938766, "kl": 0.44286154955625534, "learning_rate": 7.999845440259722e-06, "loss": -0.0478, "num_tokens": 8289975.0, "reward": 0.808021605014801, "reward_std": 0.8837414979934692, "rewards/rollout_reward_func/mean": 0.808021605014801, "rewards/rollout_reward_func/std": 0.8837414979934692, "sampling/importance_sampling_ratio/max": 1.2824339866638184, "sampling/importance_sampling_ratio/mean": 0.9514179229736328, "sampling/importance_sampling_ratio/min": 0.0031613509636372328, "sampling/sampling_logp_difference/max": 1.2385210990905762, "sampling/sampling_logp_difference/mean": 0.15722262859344482, "step": 359, "step_time": 21.161345951026306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7000968158245087, "epoch": 0.0072, "grad_norm": 0.035718824714422226, "kl": 0.4308250918984413, "learning_rate": 7.999844481760283e-06, "loss": -0.0478, "step": 360, "step_time": 11.119505367038073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 5.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1953302510082722, "epoch": 0.00722, "frac_reward_zero_std": 0.25, "grad_norm": 0.11979188024997711, "kl": 0.5231259688735008, "learning_rate": 7.999843520298042e-06, "loss": -0.054, "num_tokens": 8345603.0, "reward": 0.6429883241653442, "reward_std": 0.7599404454231262, "rewards/rollout_reward_func/mean": 0.6429883241653442, "rewards/rollout_reward_func/std": 0.7599404454231262, "sampling/importance_sampling_ratio/max": 1.8055838346481323, "sampling/importance_sampling_ratio/mean": 0.8854401707649231, "sampling/importance_sampling_ratio/min": 7.933273082016967e-06, "sampling/sampling_logp_difference/max": 1.9983799457550049, "sampling/sampling_logp_difference/mean": 0.22927197813987732, "step": 361, "step_time": 23.920724905037787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1957609057426453, "epoch": 0.00724, "grad_norm": 0.11951957643032074, "kl": 0.539969265460968, "learning_rate": 7.999842555873e-06, "loss": -0.0538, "step": 362, "step_time": 12.237502133008093 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.004310344811528921, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004310344811528921, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2438772916793823, "epoch": 0.00726, "frac_reward_zero_std": 0.125, "grad_norm": 0.3462856113910675, "kl": 1.304588295519352, "learning_rate": 7.99984158848516e-06, "loss": -0.0638, "num_tokens": 8391911.0, "reward": 0.2949181795120239, "reward_std": 0.8050936460494995, "rewards/rollout_reward_func/mean": 0.2949181795120239, "rewards/rollout_reward_func/std": 0.8050935864448547, "sampling/importance_sampling_ratio/max": 1.6840306520462036, "sampling/importance_sampling_ratio/mean": 0.8508332371711731, "sampling/importance_sampling_ratio/min": 0.00014727585949003696, "sampling/sampling_logp_difference/max": 1.968376636505127, "sampling/sampling_logp_difference/mean": 0.2049977332353592, "step": 363, "step_time": 22.668916549999267 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2479091882705688, "epoch": 0.00728, "grad_norm": 0.17155557870864868, "kl": 1.2931620478630066, "learning_rate": 7.999840618134521e-06, "loss": -0.0652, "step": 364, "step_time": 11.727799961954588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6625571697950363, "epoch": 0.0073, "frac_reward_zero_std": 0.25, "grad_norm": 0.06933950632810593, "kl": 0.5115023106336594, "learning_rate": 7.999839644821086e-06, "loss": -0.0467, "num_tokens": 8438380.0, "reward": 0.5852969884872437, "reward_std": 0.844785213470459, "rewards/rollout_reward_func/mean": 0.5852969884872437, "rewards/rollout_reward_func/std": 0.844785213470459, "sampling/importance_sampling_ratio/max": 1.2537869215011597, "sampling/importance_sampling_ratio/mean": 0.810045063495636, "sampling/importance_sampling_ratio/min": 4.373525825940305e-06, "sampling/sampling_logp_difference/max": 2.1268978118896484, "sampling/sampling_logp_difference/mean": 0.29358047246932983, "step": 365, "step_time": 21.80918214202393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.6634398251771927, "epoch": 0.00732, "grad_norm": 0.06058584153652191, "kl": 0.5077074691653252, "learning_rate": 7.999838668544853e-06, "loss": -0.0468, "step": 366, "step_time": 11.876421103952453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5458728969097137, "epoch": 0.00734, "frac_reward_zero_std": 0.125, "grad_norm": 0.044983379542827606, "kl": 0.2464906983077526, "learning_rate": 7.999837689305823e-06, "loss": -0.0325, "num_tokens": 8489694.0, "reward": 0.38989314436912537, "reward_std": 0.8096694350242615, "rewards/rollout_reward_func/mean": 0.38989314436912537, "rewards/rollout_reward_func/std": 0.8096694350242615, "sampling/importance_sampling_ratio/max": 1.4397647380828857, "sampling/importance_sampling_ratio/mean": 0.8383220434188843, "sampling/importance_sampling_ratio/min": 6.765895250282483e-06, "sampling/sampling_logp_difference/max": 2.486940383911133, "sampling/sampling_logp_difference/mean": 0.22833098471164703, "step": 367, "step_time": 23.905339919991093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5524750351905823, "epoch": 0.00736, "grad_norm": 0.04951292648911476, "kl": 0.24566977843642235, "learning_rate": 7.999836707104e-06, "loss": -0.0326, "step": 368, "step_time": 12.223930330015719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.043141186237335, "epoch": 0.00738, "frac_reward_zero_std": 0.25, "grad_norm": 0.09080897271633148, "kl": 0.41951508074998856, "learning_rate": 7.999835721939381e-06, "loss": -0.0658, "num_tokens": 8534047.0, "reward": 0.427121639251709, "reward_std": 0.9345726370811462, "rewards/rollout_reward_func/mean": 0.427121639251709, "rewards/rollout_reward_func/std": 0.9345725178718567, "sampling/importance_sampling_ratio/max": 1.4184811115264893, "sampling/importance_sampling_ratio/mean": 0.7594064474105835, "sampling/importance_sampling_ratio/min": 9.302125292265373e-09, "sampling/sampling_logp_difference/max": 2.5930514335632324, "sampling/sampling_logp_difference/mean": 0.3631237745285034, "step": 369, "step_time": 18.899121093010763 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 2.0503616631031036, "epoch": 0.0074, "grad_norm": 0.08765944093465805, "kl": 0.38717184215784073, "learning_rate": 7.999834733811973e-06, "loss": -0.0661, "step": 370, "step_time": 9.841807652002899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3060076534748077, "epoch": 0.00742, "frac_reward_zero_std": 0.0, "grad_norm": 0.09543472528457642, "kl": 0.39830172061920166, "learning_rate": 7.999833742721771e-06, "loss": -0.0795, "num_tokens": 8584738.0, "reward": 0.6751289367675781, "reward_std": 0.8568448424339294, "rewards/rollout_reward_func/mean": 0.6751289367675781, "rewards/rollout_reward_func/std": 0.8568448424339294, "sampling/importance_sampling_ratio/max": 1.6326005458831787, "sampling/importance_sampling_ratio/mean": 0.7716986536979675, "sampling/importance_sampling_ratio/min": 0.00015072370297275484, "sampling/sampling_logp_difference/max": 1.8023761510849, "sampling/sampling_logp_difference/mean": 0.24578237533569336, "step": 371, "step_time": 19.886422619019868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3129049390554428, "epoch": 0.00744, "grad_norm": 0.0767415463924408, "kl": 0.40740828961133957, "learning_rate": 7.999832748668778e-06, "loss": -0.0798, "step": 372, "step_time": 10.693443933007075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 5.608695983886719, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0816484093666077, "epoch": 0.00746, "frac_reward_zero_std": 0.125, "grad_norm": 0.06191566586494446, "kl": 0.1522895023226738, "learning_rate": 7.999831751652994e-06, "loss": -0.086, "num_tokens": 8634478.0, "reward": 0.583733320236206, "reward_std": 0.9114809632301331, "rewards/rollout_reward_func/mean": 0.583733320236206, "rewards/rollout_reward_func/std": 0.9114810228347778, "sampling/importance_sampling_ratio/max": 1.3565562963485718, "sampling/importance_sampling_ratio/mean": 0.683165967464447, "sampling/importance_sampling_ratio/min": 5.433411160993273e-07, "sampling/sampling_logp_difference/max": 1.9433631896972656, "sampling/sampling_logp_difference/mean": 0.31784945726394653, "step": 373, "step_time": 26.453685745975235 }, { "clip_ratio/high_max": 0.008064515888690948, "clip_ratio/high_mean": 0.004032257944345474, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004032257944345474, "entropy": 2.086657553911209, "epoch": 0.00748, "grad_norm": 0.06487256288528442, "kl": 0.14979607611894608, "learning_rate": 7.999830751674423e-06, "loss": -0.086, "step": 374, "step_time": 12.632899172022007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3407168835401535, "epoch": 0.0075, "frac_reward_zero_std": 0.125, "grad_norm": 0.19358094036579132, "kl": 0.4211149401962757, "learning_rate": 7.999829748733065e-06, "loss": -0.0627, "num_tokens": 8686066.0, "reward": 0.3820514678955078, "reward_std": 0.8413142561912537, "rewards/rollout_reward_func/mean": 0.3820514678955078, "rewards/rollout_reward_func/std": 0.8413142561912537, "sampling/importance_sampling_ratio/max": 1.5388641357421875, "sampling/importance_sampling_ratio/mean": 0.76903235912323, "sampling/importance_sampling_ratio/min": 0.0002324568195035681, "sampling/sampling_logp_difference/max": 1.756780982017517, "sampling/sampling_logp_difference/mean": 0.23484477400779724, "step": 375, "step_time": 24.063645026966697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.336265742778778, "epoch": 0.00752, "grad_norm": 0.17401067912578583, "kl": 0.432314470410347, "learning_rate": 7.999828742828919e-06, "loss": -0.0632, "step": 376, "step_time": 12.709263963013655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.052228331565857, "epoch": 0.00754, "frac_reward_zero_std": 0.0, "grad_norm": 0.12207590788602829, "kl": 0.3159501254558563, "learning_rate": 7.999827733961987e-06, "loss": -0.0675, "num_tokens": 8742812.0, "reward": 0.3303331136703491, "reward_std": 0.8506849408149719, "rewards/rollout_reward_func/mean": 0.3303331136703491, "rewards/rollout_reward_func/std": 0.8506849408149719, "sampling/importance_sampling_ratio/max": 1.5153948068618774, "sampling/importance_sampling_ratio/mean": 0.7163785696029663, "sampling/importance_sampling_ratio/min": 3.168229159200564e-05, "sampling/sampling_logp_difference/max": 1.7538241147994995, "sampling/sampling_logp_difference/mean": 0.2990599274635315, "step": 377, "step_time": 24.645720799977425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0463470816612244, "epoch": 0.00756, "grad_norm": 0.11996635794639587, "kl": 0.32206495478749275, "learning_rate": 7.99982672213227e-06, "loss": -0.0675, "step": 378, "step_time": 12.865013092989102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4960489012300968, "epoch": 0.00758, "frac_reward_zero_std": 0.125, "grad_norm": 0.2112269550561905, "kl": 1.0649003311991692, "learning_rate": 7.999825707339768e-06, "loss": -0.0707, "num_tokens": 8795405.0, "reward": 0.5410743951797485, "reward_std": 0.828346848487854, "rewards/rollout_reward_func/mean": 0.5410743951797485, "rewards/rollout_reward_func/std": 0.828346848487854, "sampling/importance_sampling_ratio/max": 2.0639443397521973, "sampling/importance_sampling_ratio/mean": 0.828582763671875, "sampling/importance_sampling_ratio/min": 3.881767497659894e-06, "sampling/sampling_logp_difference/max": 3.617616653442383, "sampling/sampling_logp_difference/mean": 0.27786338329315186, "step": 379, "step_time": 23.73575527506182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4949759989976883, "epoch": 0.0076, "grad_norm": 0.19248850643634796, "kl": 0.9944926872849464, "learning_rate": 7.999824689584484e-06, "loss": -0.0716, "step": 380, "step_time": 12.837385257007554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 5.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0869329124689102, "epoch": 0.00762, "frac_reward_zero_std": 0.25, "grad_norm": 0.043177537620067596, "kl": 0.402957446873188, "learning_rate": 7.99982366886642e-06, "loss": -0.0558, "num_tokens": 8844657.0, "reward": 0.5984599590301514, "reward_std": 0.9798431396484375, "rewards/rollout_reward_func/mean": 0.5984599590301514, "rewards/rollout_reward_func/std": 0.9798431396484375, "sampling/importance_sampling_ratio/max": 1.5018279552459717, "sampling/importance_sampling_ratio/mean": 0.8600393533706665, "sampling/importance_sampling_ratio/min": 0.004939338192343712, "sampling/sampling_logp_difference/max": 2.8685994148254395, "sampling/sampling_logp_difference/mean": 0.17338788509368896, "step": 381, "step_time": 21.623297156009357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.08101224899292, "epoch": 0.00764, "grad_norm": 0.03715679422020912, "kl": 0.3788239508867264, "learning_rate": 7.999822645185573e-06, "loss": -0.0559, "step": 382, "step_time": 11.40179177801474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 5.26086950302124, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7843021750450134, "epoch": 0.00766, "frac_reward_zero_std": 0.125, "grad_norm": 0.10360146313905716, "kl": 0.2993882792070508, "learning_rate": 7.999821618541947e-06, "loss": -0.0593, "num_tokens": 8899659.0, "reward": 0.3428257405757904, "reward_std": 0.9244809150695801, "rewards/rollout_reward_func/mean": 0.3428257405757904, "rewards/rollout_reward_func/std": 0.9244809150695801, "sampling/importance_sampling_ratio/max": 1.6477843523025513, "sampling/importance_sampling_ratio/mean": 0.7541524171829224, "sampling/importance_sampling_ratio/min": 1.378813885821728e-05, "sampling/sampling_logp_difference/max": 1.9044718742370605, "sampling/sampling_logp_difference/mean": 0.2597862184047699, "step": 383, "step_time": 24.91516594699351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7837929129600525, "epoch": 0.00768, "grad_norm": 0.10290965437889099, "kl": 0.2940393500030041, "learning_rate": 7.999820588935541e-06, "loss": -0.0594, "step": 384, "step_time": 12.718487304955488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6846184730529785, "epoch": 0.0077, "frac_reward_zero_std": 0.0, "grad_norm": 0.09452436119318008, "kl": 0.3594524525105953, "learning_rate": 7.999819556366359e-06, "loss": -0.0477, "num_tokens": 8955083.0, "reward": 0.668657124042511, "reward_std": 0.8259575963020325, "rewards/rollout_reward_func/mean": 0.668657124042511, "rewards/rollout_reward_func/std": 0.8259575366973877, "sampling/importance_sampling_ratio/max": 1.5655473470687866, "sampling/importance_sampling_ratio/mean": 0.9044086933135986, "sampling/importance_sampling_ratio/min": 0.00014560185081791133, "sampling/sampling_logp_difference/max": 1.8622331619262695, "sampling/sampling_logp_difference/mean": 0.27400198578834534, "step": 385, "step_time": 24.35115688899532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6817318499088287, "epoch": 0.00772, "grad_norm": 0.09555728733539581, "kl": 0.36412991397082806, "learning_rate": 7.999818520834398e-06, "loss": -0.0477, "step": 386, "step_time": 13.074046984984307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2830723375082016, "epoch": 0.00774, "frac_reward_zero_std": 0.25, "grad_norm": 0.04153033345937729, "kl": 0.31804973632097244, "learning_rate": 7.999817482339663e-06, "loss": -0.0401, "num_tokens": 9007360.0, "reward": 0.5404846668243408, "reward_std": 0.8249536752700806, "rewards/rollout_reward_func/mean": 0.5404846668243408, "rewards/rollout_reward_func/std": 0.8249536752700806, "sampling/importance_sampling_ratio/max": 1.4134823083877563, "sampling/importance_sampling_ratio/mean": 0.8419496417045593, "sampling/importance_sampling_ratio/min": 3.704570190166123e-06, "sampling/sampling_logp_difference/max": 1.5179847478866577, "sampling/sampling_logp_difference/mean": 0.23436427116394043, "step": 387, "step_time": 23.896338980994187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2821771055459976, "epoch": 0.00776, "grad_norm": 0.04179755970835686, "kl": 0.32170819118618965, "learning_rate": 7.999816440882153e-06, "loss": -0.0402, "step": 388, "step_time": 12.908465448010247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8958695456385612, "epoch": 0.00778, "frac_reward_zero_std": 0.25, "grad_norm": 0.05803319811820984, "kl": 0.5360071174800396, "learning_rate": 7.999815396461869e-06, "loss": -0.0394, "num_tokens": 9061944.0, "reward": 0.842174768447876, "reward_std": 0.7830149531364441, "rewards/rollout_reward_func/mean": 0.842174768447876, "rewards/rollout_reward_func/std": 0.7830149531364441, "sampling/importance_sampling_ratio/max": 1.56503427028656, "sampling/importance_sampling_ratio/mean": 0.9748367071151733, "sampling/importance_sampling_ratio/min": 5.102034350557005e-10, "sampling/sampling_logp_difference/max": 2.337951183319092, "sampling/sampling_logp_difference/mean": 0.24096350371837616, "step": 389, "step_time": 23.936904909001896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8953728228807449, "epoch": 0.0078, "grad_norm": 0.05609913542866707, "kl": 0.5124025046825409, "learning_rate": 7.99981434907881e-06, "loss": -0.0395, "step": 390, "step_time": 12.939568116009468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.37788724899292, "epoch": 0.00782, "frac_reward_zero_std": 0.25, "grad_norm": 0.06923040002584457, "kl": 0.30113303661346436, "learning_rate": 7.999813298732984e-06, "loss": -0.0582, "num_tokens": 9110306.0, "reward": 0.7236330509185791, "reward_std": 0.8284523487091064, "rewards/rollout_reward_func/mean": 0.7236330509185791, "rewards/rollout_reward_func/std": 0.8284523487091064, "sampling/importance_sampling_ratio/max": 1.569911003112793, "sampling/importance_sampling_ratio/mean": 0.8172758221626282, "sampling/importance_sampling_ratio/min": 0.00010964117245748639, "sampling/sampling_logp_difference/max": 2.0000650882720947, "sampling/sampling_logp_difference/mean": 0.21194612979888916, "step": 391, "step_time": 22.099482109973906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3794040381908417, "epoch": 0.00784, "grad_norm": 0.06714945286512375, "kl": 0.3022683784365654, "learning_rate": 7.999812245424385e-06, "loss": -0.0583, "step": 392, "step_time": 11.325063325028168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 5.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4092693030834198, "epoch": 0.00786, "frac_reward_zero_std": 0.0, "grad_norm": 0.06488002091646194, "kl": 0.35167688876390457, "learning_rate": 7.999811189153016e-06, "loss": -0.0646, "num_tokens": 9158656.0, "reward": 0.6813830137252808, "reward_std": 0.9415714144706726, "rewards/rollout_reward_func/mean": 0.6813830137252808, "rewards/rollout_reward_func/std": 0.9415714144706726, "sampling/importance_sampling_ratio/max": 1.6144458055496216, "sampling/importance_sampling_ratio/mean": 0.7336368560791016, "sampling/importance_sampling_ratio/min": 0.0007402427727356553, "sampling/sampling_logp_difference/max": 1.3249422311782837, "sampling/sampling_logp_difference/mean": 0.21281597018241882, "step": 393, "step_time": 24.489689962007105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4114397317171097, "epoch": 0.00788, "grad_norm": 0.0642082467675209, "kl": 0.3491496443748474, "learning_rate": 7.999810129918879e-06, "loss": -0.0648, "step": 394, "step_time": 12.771702302998165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 5.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5013773739337921, "epoch": 0.0079, "frac_reward_zero_std": 0.25, "grad_norm": 0.09363668411970139, "kl": 0.38228341192007065, "learning_rate": 7.999809067721974e-06, "loss": -0.0374, "num_tokens": 9207991.0, "reward": 0.656102180480957, "reward_std": 0.9176774024963379, "rewards/rollout_reward_func/mean": 0.656102180480957, "rewards/rollout_reward_func/std": 0.9176774024963379, "sampling/importance_sampling_ratio/max": 1.5327167510986328, "sampling/importance_sampling_ratio/mean": 0.8923162817955017, "sampling/importance_sampling_ratio/min": 1.9693006834131666e-05, "sampling/sampling_logp_difference/max": 2.3422985076904297, "sampling/sampling_logp_difference/mean": 0.2649548053741455, "step": 395, "step_time": 23.042844502982916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.50142103433609, "epoch": 0.00792, "grad_norm": 0.08377992361783981, "kl": 0.37490763515233994, "learning_rate": 7.999808002562303e-06, "loss": -0.0377, "step": 396, "step_time": 12.174337728065439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2204180359840393, "epoch": 0.00794, "frac_reward_zero_std": 0.375, "grad_norm": 0.05796537175774574, "kl": 0.3986216261982918, "learning_rate": 7.999806934439867e-06, "loss": -0.0205, "num_tokens": 9253723.0, "reward": 0.6256709694862366, "reward_std": 0.8519819974899292, "rewards/rollout_reward_func/mean": 0.6256709694862366, "rewards/rollout_reward_func/std": 0.8519819378852844, "sampling/importance_sampling_ratio/max": 1.5157586336135864, "sampling/importance_sampling_ratio/mean": 0.9507604241371155, "sampling/importance_sampling_ratio/min": 1.0387811926193535e-05, "sampling/sampling_logp_difference/max": 1.9863977432250977, "sampling/sampling_logp_difference/mean": 0.2318364679813385, "step": 397, "step_time": 22.882554781012004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2239589989185333, "epoch": 0.00796, "grad_norm": 0.058977238833904266, "kl": 0.3821873962879181, "learning_rate": 7.999805863354665e-06, "loss": -0.0206, "step": 398, "step_time": 12.616248637990793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6631081998348236, "epoch": 0.00798, "frac_reward_zero_std": 0.25, "grad_norm": 0.10495145618915558, "kl": 0.4487977437674999, "learning_rate": 7.999804789306702e-06, "loss": -0.0257, "num_tokens": 9304906.0, "reward": 0.6314609050750732, "reward_std": 0.8694499731063843, "rewards/rollout_reward_func/mean": 0.6314609050750732, "rewards/rollout_reward_func/std": 0.8694500923156738, "sampling/importance_sampling_ratio/max": 1.2738759517669678, "sampling/importance_sampling_ratio/mean": 0.7874116897583008, "sampling/importance_sampling_ratio/min": 1.304692798242968e-09, "sampling/sampling_logp_difference/max": 2.1331229209899902, "sampling/sampling_logp_difference/mean": 0.31229323148727417, "step": 399, "step_time": 23.076182784017874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004310344811528921, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004310344811528921, "entropy": 1.672993779182434, "epoch": 0.008, "grad_norm": 0.10413692146539688, "kl": 0.4436335824429989, "learning_rate": 7.999803712295974e-06, "loss": -0.0258, "step": 400, "step_time": 12.02134020801168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.238095283508301, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.965275377035141, "epoch": 0.00802, "frac_reward_zero_std": 0.0, "grad_norm": 0.06644978374242783, "kl": 0.2943408042192459, "learning_rate": 7.999802632322487e-06, "loss": -0.0884, "num_tokens": 9358186.0, "reward": 0.36665159463882446, "reward_std": 0.9454484581947327, "rewards/rollout_reward_func/mean": 0.36665159463882446, "rewards/rollout_reward_func/std": 0.9454484581947327, "sampling/importance_sampling_ratio/max": 1.4194670915603638, "sampling/importance_sampling_ratio/mean": 0.6522266268730164, "sampling/importance_sampling_ratio/min": 3.448223287705332e-05, "sampling/sampling_logp_difference/max": 1.654649257659912, "sampling/sampling_logp_difference/mean": 0.26090288162231445, "step": 401, "step_time": 24.533732419018634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9670455753803253, "epoch": 0.00804, "grad_norm": 0.06662033498287201, "kl": 0.27897467091679573, "learning_rate": 7.999801549386239e-06, "loss": -0.0884, "step": 402, "step_time": 12.37930252100341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4149203151464462, "epoch": 0.00806, "frac_reward_zero_std": 0.375, "grad_norm": 0.11374464631080627, "kl": 0.31418564170598984, "learning_rate": 7.999800463487233e-06, "loss": -0.0675, "num_tokens": 9411412.0, "reward": 0.6766797304153442, "reward_std": 0.9457155466079712, "rewards/rollout_reward_func/mean": 0.6766797304153442, "rewards/rollout_reward_func/std": 0.9457155466079712, "sampling/importance_sampling_ratio/max": 2.1977944374084473, "sampling/importance_sampling_ratio/mean": 0.8660398125648499, "sampling/importance_sampling_ratio/min": 0.00018669954442884773, "sampling/sampling_logp_difference/max": 1.8782777786254883, "sampling/sampling_logp_difference/mean": 0.22099024057388306, "step": 403, "step_time": 26.909844716981752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4213159829378128, "epoch": 0.00808, "grad_norm": 0.11266753077507019, "kl": 0.3054322861135006, "learning_rate": 7.999799374625469e-06, "loss": -0.0677, "step": 404, "step_time": 13.100859470985597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.9375, "completions/mean_terminated_length": 5.238095283508301, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8458164259791374, "epoch": 0.0081, "frac_reward_zero_std": 0.125, "grad_norm": 0.12583516538143158, "kl": 0.7384599167853594, "learning_rate": 7.999798282800947e-06, "loss": -0.0697, "num_tokens": 9461416.0, "reward": 0.44568121433258057, "reward_std": 0.9217012524604797, "rewards/rollout_reward_func/mean": 0.44568121433258057, "rewards/rollout_reward_func/std": 0.9217012524604797, "sampling/importance_sampling_ratio/max": 1.5689375400543213, "sampling/importance_sampling_ratio/mean": 0.5849679708480835, "sampling/importance_sampling_ratio/min": 5.327981398295378e-06, "sampling/sampling_logp_difference/max": 2.3502726554870605, "sampling/sampling_logp_difference/mean": 0.30698078870773315, "step": 405, "step_time": 21.824432317022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8473533689975739, "epoch": 0.00812, "grad_norm": 0.12564218044281006, "kl": 0.764321316499263, "learning_rate": 7.99979718801367e-06, "loss": -0.0697, "step": 406, "step_time": 10.981467814010102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.641566812992096, "epoch": 0.00814, "frac_reward_zero_std": 0.125, "grad_norm": 0.05247064679861069, "kl": 0.265664990991354, "learning_rate": 7.999796090263637e-06, "loss": -0.0555, "num_tokens": 9510278.0, "reward": 0.7355389595031738, "reward_std": 0.8814152479171753, "rewards/rollout_reward_func/mean": 0.7355389595031738, "rewards/rollout_reward_func/std": 0.8814151883125305, "sampling/importance_sampling_ratio/max": 1.6155282258987427, "sampling/importance_sampling_ratio/mean": 0.8120545744895935, "sampling/importance_sampling_ratio/min": 3.5517505239113234e-06, "sampling/sampling_logp_difference/max": 1.8355698585510254, "sampling/sampling_logp_difference/mean": 0.2577299177646637, "step": 407, "step_time": 19.629784168006154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6395619213581085, "epoch": 0.00816, "grad_norm": 0.051846109330654144, "kl": 0.2759963572025299, "learning_rate": 7.999794989550851e-06, "loss": -0.0557, "step": 408, "step_time": 9.926651810994372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8418918699026108, "epoch": 0.00818, "frac_reward_zero_std": 0.0, "grad_norm": 0.06895598769187927, "kl": 0.32111676037311554, "learning_rate": 7.999793885875313e-06, "loss": -0.0962, "num_tokens": 9563271.0, "reward": 0.4858691692352295, "reward_std": 0.8512759208679199, "rewards/rollout_reward_func/mean": 0.4858691692352295, "rewards/rollout_reward_func/std": 0.8512759208679199, "sampling/importance_sampling_ratio/max": 1.8612840175628662, "sampling/importance_sampling_ratio/mean": 0.6980567574501038, "sampling/importance_sampling_ratio/min": 3.947255390812643e-05, "sampling/sampling_logp_difference/max": 1.9864046573638916, "sampling/sampling_logp_difference/mean": 0.28261038661003113, "step": 409, "step_time": 23.78075375701883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.8492402285337448, "epoch": 0.0082, "grad_norm": 0.05616135895252228, "kl": 0.3402685336768627, "learning_rate": 7.999792779237024e-06, "loss": -0.0965, "step": 410, "step_time": 12.126066424010787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8295876681804657, "epoch": 0.00822, "frac_reward_zero_std": 0.125, "grad_norm": 0.24020971357822418, "kl": 0.19181462936103344, "learning_rate": 7.999791669635984e-06, "loss": -0.0503, "num_tokens": 9617928.0, "reward": 0.4449858069419861, "reward_std": 0.9083369374275208, "rewards/rollout_reward_func/mean": 0.4449858069419861, "rewards/rollout_reward_func/std": 0.908336877822876, "sampling/importance_sampling_ratio/max": 1.4709240198135376, "sampling/importance_sampling_ratio/mean": 0.7108081579208374, "sampling/importance_sampling_ratio/min": 1.6664042050251737e-05, "sampling/sampling_logp_difference/max": 1.920841097831726, "sampling/sampling_logp_difference/mean": 0.25543212890625, "step": 411, "step_time": 25.855295840010513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.826476812362671, "epoch": 0.00824, "grad_norm": 0.2502501904964447, "kl": 0.19479945860803127, "learning_rate": 7.999790557072196e-06, "loss": -0.0511, "step": 412, "step_time": 12.398641671985388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 4.2916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6551187634468079, "epoch": 0.00826, "frac_reward_zero_std": 0.125, "grad_norm": 0.09712795168161392, "kl": 0.5276165828108788, "learning_rate": 7.999789441545657e-06, "loss": -0.0494, "num_tokens": 9670456.0, "reward": 0.6651156544685364, "reward_std": 0.9138508439064026, "rewards/rollout_reward_func/mean": 0.6651156544685364, "rewards/rollout_reward_func/std": 0.9138507843017578, "sampling/importance_sampling_ratio/max": 1.3188756704330444, "sampling/importance_sampling_ratio/mean": 0.7257462739944458, "sampling/importance_sampling_ratio/min": 6.112561823101714e-05, "sampling/sampling_logp_difference/max": 2.30033016204834, "sampling/sampling_logp_difference/mean": 0.2675890624523163, "step": 413, "step_time": 23.78976690597483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6529930531978607, "epoch": 0.00828, "grad_norm": 0.09433671087026596, "kl": 0.5401150602847338, "learning_rate": 7.999788323056372e-06, "loss": -0.0496, "step": 414, "step_time": 12.266143248038134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5920647233724594, "epoch": 0.0083, "frac_reward_zero_std": 0.0, "grad_norm": 0.09001532196998596, "kl": 0.730140931904316, "learning_rate": 7.999787201604342e-06, "loss": -0.0382, "num_tokens": 9725418.0, "reward": 0.35282036662101746, "reward_std": 0.7417213916778564, "rewards/rollout_reward_func/mean": 0.35282036662101746, "rewards/rollout_reward_func/std": 0.7417213320732117, "sampling/importance_sampling_ratio/max": 1.5724916458129883, "sampling/importance_sampling_ratio/mean": 0.7533508539199829, "sampling/importance_sampling_ratio/min": 2.4964651856862474e-06, "sampling/sampling_logp_difference/max": 1.8288640975952148, "sampling/sampling_logp_difference/mean": 0.3037843704223633, "step": 415, "step_time": 22.661683445010567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5921305865049362, "epoch": 0.00832, "grad_norm": 0.08756639063358307, "kl": 0.7109771370887756, "learning_rate": 7.999786077189566e-06, "loss": -0.0382, "step": 416, "step_time": 12.23047203803435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.034482955932617, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8480799347162247, "epoch": 0.00834, "frac_reward_zero_std": 0.25, "grad_norm": 0.12759990990161896, "kl": 0.6914780512452126, "learning_rate": 7.999784949812047e-06, "loss": -0.0318, "num_tokens": 9777616.0, "reward": 0.7270800471305847, "reward_std": 0.7570073008537292, "rewards/rollout_reward_func/mean": 0.7270800471305847, "rewards/rollout_reward_func/std": 0.7570073008537292, "sampling/importance_sampling_ratio/max": 1.5614110231399536, "sampling/importance_sampling_ratio/mean": 0.9234011173248291, "sampling/importance_sampling_ratio/min": 2.1485246179508977e-05, "sampling/sampling_logp_difference/max": 1.7658196687698364, "sampling/sampling_logp_difference/mean": 0.19338607788085938, "step": 417, "step_time": 22.018025028984994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8551185727119446, "epoch": 0.00836, "grad_norm": 0.12287060171365738, "kl": 0.6485683396458626, "learning_rate": 7.999783819471785e-06, "loss": -0.0318, "step": 418, "step_time": 11.961167160974583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1482404507696629, "epoch": 0.00838, "frac_reward_zero_std": 0.0, "grad_norm": 0.04987984150648117, "kl": 0.4728366360068321, "learning_rate": 7.999782686168783e-06, "loss": -0.0689, "num_tokens": 9833836.0, "reward": 0.7205054759979248, "reward_std": 0.7670256495475769, "rewards/rollout_reward_func/mean": 0.7205054759979248, "rewards/rollout_reward_func/std": 0.7670255899429321, "sampling/importance_sampling_ratio/max": 1.343017339706421, "sampling/importance_sampling_ratio/mean": 0.8606135845184326, "sampling/importance_sampling_ratio/min": 0.0004032946308143437, "sampling/sampling_logp_difference/max": 1.8830816745758057, "sampling/sampling_logp_difference/mean": 0.20758068561553955, "step": 419, "step_time": 23.103998143982608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1568411849439144, "epoch": 0.0084, "grad_norm": 0.05076693743467331, "kl": 0.4531935229897499, "learning_rate": 7.99978154990304e-06, "loss": -0.0689, "step": 420, "step_time": 12.640575571946101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.079999923706055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4812177866697311, "epoch": 0.00842, "frac_reward_zero_std": 0.125, "grad_norm": 0.07445847243070602, "kl": 0.3762430138885975, "learning_rate": 7.999780410674556e-06, "loss": -0.0776, "num_tokens": 9890829.0, "reward": 0.605161190032959, "reward_std": 0.879159152507782, "rewards/rollout_reward_func/mean": 0.605161190032959, "rewards/rollout_reward_func/std": 0.8791590929031372, "sampling/importance_sampling_ratio/max": 1.3403916358947754, "sampling/importance_sampling_ratio/mean": 0.7555752396583557, "sampling/importance_sampling_ratio/min": 1.6323427189490758e-05, "sampling/sampling_logp_difference/max": 2.0642313957214355, "sampling/sampling_logp_difference/mean": 0.2706795930862427, "step": 421, "step_time": 24.314286707056453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4822627753019333, "epoch": 0.00844, "grad_norm": 0.07794193923473358, "kl": 0.35032685846090317, "learning_rate": 7.999779268483335e-06, "loss": -0.0776, "step": 422, "step_time": 12.98950216700905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9938806593418121, "epoch": 0.00846, "frac_reward_zero_std": 0.0, "grad_norm": 0.06784449517726898, "kl": 0.513115007430315, "learning_rate": 7.999778123329377e-06, "loss": -0.0403, "num_tokens": 9950129.0, "reward": 0.43760499358177185, "reward_std": 0.8987090587615967, "rewards/rollout_reward_func/mean": 0.43760499358177185, "rewards/rollout_reward_func/std": 0.8987090587615967, "sampling/importance_sampling_ratio/max": 1.8153173923492432, "sampling/importance_sampling_ratio/mean": 0.7292591333389282, "sampling/importance_sampling_ratio/min": 3.8305716998365824e-07, "sampling/sampling_logp_difference/max": 1.9197497367858887, "sampling/sampling_logp_difference/mean": 0.33767664432525635, "step": 423, "step_time": 25.656740581034683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0008716583251953, "epoch": 0.00848, "grad_norm": 0.06370657682418823, "kl": 0.4849278926849365, "learning_rate": 7.999776975212683e-06, "loss": -0.0407, "step": 424, "step_time": 13.163139407988638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.3125, "completions/mean_terminated_length": 4.518518447875977, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.236597552895546, "epoch": 0.0085, "frac_reward_zero_std": 0.125, "grad_norm": 0.043608639389276505, "kl": 0.5650957562029362, "learning_rate": 7.999775824133253e-06, "loss": -0.0342, "num_tokens": 10002825.0, "reward": 0.28639429807662964, "reward_std": 0.8707245588302612, "rewards/rollout_reward_func/mean": 0.28639429807662964, "rewards/rollout_reward_func/std": 0.8707244396209717, "sampling/importance_sampling_ratio/max": 1.2768869400024414, "sampling/importance_sampling_ratio/mean": 0.7428592443466187, "sampling/importance_sampling_ratio/min": 6.947809015400708e-05, "sampling/sampling_logp_difference/max": 2.373613119125366, "sampling/sampling_logp_difference/mean": 0.24698758125305176, "step": 425, "step_time": 21.13629207201302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2391860485076904, "epoch": 0.00852, "grad_norm": 0.041896071285009384, "kl": 0.5435481518507004, "learning_rate": 7.999774670091091e-06, "loss": -0.0343, "step": 426, "step_time": 11.12040852298378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6657229401171207, "epoch": 0.00854, "frac_reward_zero_std": 0.125, "grad_norm": 0.05128807574510574, "kl": 0.5939699802547693, "learning_rate": 7.999773513086197e-06, "loss": -0.0679, "num_tokens": 10053203.0, "reward": 0.603046715259552, "reward_std": 0.8703611493110657, "rewards/rollout_reward_func/mean": 0.603046715259552, "rewards/rollout_reward_func/std": 0.8703610897064209, "sampling/importance_sampling_ratio/max": 1.3309533596038818, "sampling/importance_sampling_ratio/mean": 0.7556934356689453, "sampling/importance_sampling_ratio/min": 1.7128975571267802e-07, "sampling/sampling_logp_difference/max": 1.851290225982666, "sampling/sampling_logp_difference/mean": 0.28770288825035095, "step": 427, "step_time": 23.968051326985005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6667218543589115, "epoch": 0.00856, "grad_norm": 0.04985074698925018, "kl": 0.583908474072814, "learning_rate": 7.99977235311857e-06, "loss": -0.0679, "step": 428, "step_time": 12.605651937949006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.432275652885437, "epoch": 0.00858, "frac_reward_zero_std": 0.0, "grad_norm": 0.09158522635698318, "kl": 0.2383449710905552, "learning_rate": 7.999771190188214e-06, "loss": -0.0614, "num_tokens": 10113461.0, "reward": 0.5687496066093445, "reward_std": 0.803333580493927, "rewards/rollout_reward_func/mean": 0.5687496066093445, "rewards/rollout_reward_func/std": 0.803333580493927, "sampling/importance_sampling_ratio/max": 1.3509348630905151, "sampling/importance_sampling_ratio/mean": 0.8278583288192749, "sampling/importance_sampling_ratio/min": 2.1150935936020687e-05, "sampling/sampling_logp_difference/max": 2.166149139404297, "sampling/sampling_logp_difference/mean": 0.24841979146003723, "step": 429, "step_time": 24.64524806899135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4406747221946716, "epoch": 0.0086, "grad_norm": 0.09941821545362473, "kl": 0.23244832456111908, "learning_rate": 7.99977002429513e-06, "loss": -0.0616, "step": 430, "step_time": 14.111056222987827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.40625, "completions/mean_terminated_length": 4.8947367668151855, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.331647425889969, "epoch": 0.00862, "frac_reward_zero_std": 0.0, "grad_norm": 0.06396996229887009, "kl": 0.24250778928399086, "learning_rate": 7.999768855439316e-06, "loss": -0.0765, "num_tokens": 10172490.0, "reward": 0.37474632263183594, "reward_std": 0.9096826314926147, "rewards/rollout_reward_func/mean": 0.37474632263183594, "rewards/rollout_reward_func/std": 0.9096826314926147, "sampling/importance_sampling_ratio/max": 1.4686298370361328, "sampling/importance_sampling_ratio/mean": 0.5222853422164917, "sampling/importance_sampling_ratio/min": 2.743468257904169e-06, "sampling/sampling_logp_difference/max": 2.1188817024230957, "sampling/sampling_logp_difference/mean": 0.3701289892196655, "step": 431, "step_time": 26.26516573599656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3338328301906586, "epoch": 0.00864, "grad_norm": 0.06049439311027527, "kl": 0.24488292261958122, "learning_rate": 7.999767683620775e-06, "loss": -0.0766, "step": 432, "step_time": 13.197442504024366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 6.107142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8043862283229828, "epoch": 0.00866, "frac_reward_zero_std": 0.125, "grad_norm": 0.1132597029209137, "kl": 0.3288910277187824, "learning_rate": 7.99976650883951e-06, "loss": -0.0688, "num_tokens": 10224540.0, "reward": 0.181508406996727, "reward_std": 0.8139283657073975, "rewards/rollout_reward_func/mean": 0.181508406996727, "rewards/rollout_reward_func/std": 0.8139283657073975, "sampling/importance_sampling_ratio/max": 2.0161423683166504, "sampling/importance_sampling_ratio/mean": 0.7172231674194336, "sampling/importance_sampling_ratio/min": 3.48978464899119e-05, "sampling/sampling_logp_difference/max": 1.8047900199890137, "sampling/sampling_logp_difference/mean": 0.2701529264450073, "step": 433, "step_time": 25.22588004299905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7993966341018677, "epoch": 0.00868, "grad_norm": 0.11561944335699081, "kl": 0.34343044087290764, "learning_rate": 7.99976533109552e-06, "loss": -0.0687, "step": 434, "step_time": 12.705964468972525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8807033002376556, "epoch": 0.0087, "frac_reward_zero_std": 0.125, "grad_norm": 0.07869573682546616, "kl": 0.6294778250157833, "learning_rate": 7.999764150388807e-06, "loss": -0.0558, "num_tokens": 10275265.0, "reward": 0.14395731687545776, "reward_std": 0.7205622792243958, "rewards/rollout_reward_func/mean": 0.14395731687545776, "rewards/rollout_reward_func/std": 0.7205621600151062, "sampling/importance_sampling_ratio/max": 1.4148255586624146, "sampling/importance_sampling_ratio/mean": 0.8106837272644043, "sampling/importance_sampling_ratio/min": 2.727117589529371e-06, "sampling/sampling_logp_difference/max": 2.425508499145508, "sampling/sampling_logp_difference/mean": 0.33092963695526123, "step": 435, "step_time": 22.11320002100547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8787648379802704, "epoch": 0.00872, "grad_norm": 0.08014131337404251, "kl": 0.6173723414540291, "learning_rate": 7.999762966719372e-06, "loss": -0.0559, "step": 436, "step_time": 11.810310153989121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.296296119689941, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0284523367881775, "epoch": 0.00874, "frac_reward_zero_std": 0.125, "grad_norm": 0.13734711706638336, "kl": 0.35950227081775665, "learning_rate": 7.999761780087215e-06, "loss": -0.0552, "num_tokens": 10325340.0, "reward": 0.6277010440826416, "reward_std": 0.8623602390289307, "rewards/rollout_reward_func/mean": 0.6277010440826416, "rewards/rollout_reward_func/std": 0.8623601794242859, "sampling/importance_sampling_ratio/max": 1.3142749071121216, "sampling/importance_sampling_ratio/mean": 0.7730602025985718, "sampling/importance_sampling_ratio/min": 1.3305682777797756e-08, "sampling/sampling_logp_difference/max": 2.467468023300171, "sampling/sampling_logp_difference/mean": 0.3777674436569214, "step": 437, "step_time": 22.182036831014557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.024652272462845, "epoch": 0.00876, "grad_norm": 0.12263184040784836, "kl": 0.37034647911787033, "learning_rate": 7.99976059049234e-06, "loss": -0.0557, "step": 438, "step_time": 12.04762961799861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.920000076293945, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.996584951877594, "epoch": 0.00878, "frac_reward_zero_std": 0.125, "grad_norm": 0.10378459095954895, "kl": 0.6578706800937653, "learning_rate": 7.999759397934744e-06, "loss": -0.0485, "num_tokens": 10377404.0, "reward": 0.4813714027404785, "reward_std": 0.8697435855865479, "rewards/rollout_reward_func/mean": 0.4813714027404785, "rewards/rollout_reward_func/std": 0.8697435855865479, "sampling/importance_sampling_ratio/max": 1.4356796741485596, "sampling/importance_sampling_ratio/mean": 0.7185664176940918, "sampling/importance_sampling_ratio/min": 2.388711436651647e-06, "sampling/sampling_logp_difference/max": 2.165705680847168, "sampling/sampling_logp_difference/mean": 0.3326863646507263, "step": 439, "step_time": 25.68103651603451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.986630916595459, "epoch": 0.0088, "grad_norm": 0.10762215405702591, "kl": 0.6650097519159317, "learning_rate": 7.999758202414433e-06, "loss": -0.0487, "step": 440, "step_time": 13.213349005032796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2514194548130035, "epoch": 0.00882, "frac_reward_zero_std": 0.0, "grad_norm": 0.0423499159514904, "kl": 0.4838288500905037, "learning_rate": 7.999757003931403e-06, "loss": -0.0716, "num_tokens": 10432520.0, "reward": 0.5924427509307861, "reward_std": 0.8672366142272949, "rewards/rollout_reward_func/mean": 0.5924427509307861, "rewards/rollout_reward_func/std": 0.8672366142272949, "sampling/importance_sampling_ratio/max": 1.824174165725708, "sampling/importance_sampling_ratio/mean": 0.8831590414047241, "sampling/importance_sampling_ratio/min": 6.618741463171318e-05, "sampling/sampling_logp_difference/max": 1.7547520399093628, "sampling/sampling_logp_difference/mean": 0.23185530304908752, "step": 441, "step_time": 23.042346547008492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.240860067307949, "epoch": 0.00884, "grad_norm": 0.04123005270957947, "kl": 0.4810945503413677, "learning_rate": 7.99975580248566e-06, "loss": -0.0717, "step": 442, "step_time": 12.319634643994505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2857031524181366, "epoch": 0.00886, "frac_reward_zero_std": 0.0, "grad_norm": 0.05400654673576355, "kl": 0.1661052331328392, "learning_rate": 7.999754598077204e-06, "loss": -0.0713, "num_tokens": 10481063.0, "reward": 0.3528147339820862, "reward_std": 0.9993940591812134, "rewards/rollout_reward_func/mean": 0.3528147339820862, "rewards/rollout_reward_func/std": 0.9993940591812134, "sampling/importance_sampling_ratio/max": 1.47401762008667, "sampling/importance_sampling_ratio/mean": 0.6941298246383667, "sampling/importance_sampling_ratio/min": 6.759448556437064e-09, "sampling/sampling_logp_difference/max": 2.2110204696655273, "sampling/sampling_logp_difference/mean": 0.3639119267463684, "step": 443, "step_time": 25.46551694200025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2756454795598984, "epoch": 0.00888, "grad_norm": 0.0457555428147316, "kl": 0.1690925993025303, "learning_rate": 7.999753390706035e-06, "loss": -0.0714, "step": 444, "step_time": 12.493754522962263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 5.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4052599966526031, "epoch": 0.0089, "frac_reward_zero_std": 0.0, "grad_norm": 0.06478344649076462, "kl": 0.6934923864901066, "learning_rate": 7.999752180372154e-06, "loss": -0.0682, "num_tokens": 10534454.0, "reward": 0.5603077411651611, "reward_std": 0.9139828681945801, "rewards/rollout_reward_func/mean": 0.5603077411651611, "rewards/rollout_reward_func/std": 0.9139828681945801, "sampling/importance_sampling_ratio/max": 1.571278691291809, "sampling/importance_sampling_ratio/mean": 0.763940155506134, "sampling/importance_sampling_ratio/min": 7.851072587072849e-05, "sampling/sampling_logp_difference/max": 1.7825901508331299, "sampling/sampling_logp_difference/mean": 0.22833839058876038, "step": 445, "step_time": 24.478246763028437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4021514058113098, "epoch": 0.00892, "grad_norm": 0.06600227952003479, "kl": 0.7407906837761402, "learning_rate": 7.999750967075562e-06, "loss": -0.0682, "step": 446, "step_time": 12.647518290992593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.043478488922119, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5590913593769073, "epoch": 0.00894, "frac_reward_zero_std": 0.125, "grad_norm": 0.06328113377094269, "kl": 0.29432056471705437, "learning_rate": 7.99974975081626e-06, "loss": -0.0567, "num_tokens": 10591141.0, "reward": 0.49615174531936646, "reward_std": 0.7988648414611816, "rewards/rollout_reward_func/mean": 0.49615174531936646, "rewards/rollout_reward_func/std": 0.7988647818565369, "sampling/importance_sampling_ratio/max": 1.5379382371902466, "sampling/importance_sampling_ratio/mean": 0.8153661489486694, "sampling/importance_sampling_ratio/min": 3.1935514925862662e-06, "sampling/sampling_logp_difference/max": 2.1849617958068848, "sampling/sampling_logp_difference/mean": 0.252957820892334, "step": 447, "step_time": 24.575830234010937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.558253064751625, "epoch": 0.00896, "grad_norm": 0.06378459930419922, "kl": 0.29492538422346115, "learning_rate": 7.999748531594253e-06, "loss": -0.0569, "step": 448, "step_time": 12.63206366301165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.448276042938232, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2688776478171349, "epoch": 0.00898, "frac_reward_zero_std": 0.125, "grad_norm": 0.16746310889720917, "kl": 0.6205133758485317, "learning_rate": 7.999747309409537e-06, "loss": -0.058, "num_tokens": 10641902.0, "reward": 0.6592237949371338, "reward_std": 0.8575702905654907, "rewards/rollout_reward_func/mean": 0.6592237949371338, "rewards/rollout_reward_func/std": 0.8575702905654907, "sampling/importance_sampling_ratio/max": 1.364074945449829, "sampling/importance_sampling_ratio/mean": 0.8468252420425415, "sampling/importance_sampling_ratio/min": 9.162435890175402e-05, "sampling/sampling_logp_difference/max": 1.9106523990631104, "sampling/sampling_logp_difference/mean": 0.24777063727378845, "step": 449, "step_time": 22.193417145026615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2757907286286354, "epoch": 0.009, "grad_norm": 0.17345985770225525, "kl": 0.5912329517304897, "learning_rate": 7.999746084262117e-06, "loss": -0.0585, "step": 450, "step_time": 11.00779630499892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.576923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7475085258483887, "epoch": 0.00902, "frac_reward_zero_std": 0.0, "grad_norm": 0.1237955316901207, "kl": 0.2025589421391487, "learning_rate": 7.999744856151992e-06, "loss": -0.0757, "num_tokens": 10701096.0, "reward": 0.6151396632194519, "reward_std": 0.9179320931434631, "rewards/rollout_reward_func/mean": 0.6151396632194519, "rewards/rollout_reward_func/std": 0.9179320335388184, "sampling/importance_sampling_ratio/max": 2.1592891216278076, "sampling/importance_sampling_ratio/mean": 0.9376415014266968, "sampling/importance_sampling_ratio/min": 0.0002792473533190787, "sampling/sampling_logp_difference/max": 1.890535831451416, "sampling/sampling_logp_difference/mean": 0.25515609979629517, "step": 451, "step_time": 24.78137747303117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7602026164531708, "epoch": 0.00904, "grad_norm": 0.13662958145141602, "kl": 0.2017062082886696, "learning_rate": 7.999743625079163e-06, "loss": -0.0764, "step": 452, "step_time": 13.053899832011666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4475955963134766, "epoch": 0.00906, "frac_reward_zero_std": 0.25, "grad_norm": 0.28664734959602356, "kl": 0.22972562350332737, "learning_rate": 7.999742391043633e-06, "loss": -0.0458, "num_tokens": 10753340.0, "reward": 0.5287416577339172, "reward_std": 0.9884362816810608, "rewards/rollout_reward_func/mean": 0.5287416577339172, "rewards/rollout_reward_func/std": 0.988436222076416, "sampling/importance_sampling_ratio/max": 1.6087347269058228, "sampling/importance_sampling_ratio/mean": 0.8246411681175232, "sampling/importance_sampling_ratio/min": 0.00045266406959854066, "sampling/sampling_logp_difference/max": 2.26257586479187, "sampling/sampling_logp_difference/mean": 0.22356243431568146, "step": 453, "step_time": 28.574605214002077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.460473194718361, "epoch": 0.00908, "grad_norm": 0.05436444655060768, "kl": 0.2277243211865425, "learning_rate": 7.999741154045404e-06, "loss": -0.0464, "step": 454, "step_time": 14.79954183800146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6799510717391968, "epoch": 0.0091, "frac_reward_zero_std": 0.125, "grad_norm": 0.09912983328104019, "kl": 1.1695812493562698, "learning_rate": 7.999739914084476e-06, "loss": -0.0686, "num_tokens": 10807935.0, "reward": 0.8152505159378052, "reward_std": 0.7662681341171265, "rewards/rollout_reward_func/mean": 0.8152505159378052, "rewards/rollout_reward_func/std": 0.7662680149078369, "sampling/importance_sampling_ratio/max": 2.6845762729644775, "sampling/importance_sampling_ratio/mean": 0.8986142873764038, "sampling/importance_sampling_ratio/min": 3.0445529773714952e-05, "sampling/sampling_logp_difference/max": 2.200533866882324, "sampling/sampling_logp_difference/mean": 0.3012310862541199, "step": 455, "step_time": 24.703698599972995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6828879714012146, "epoch": 0.00912, "grad_norm": 0.09537768363952637, "kl": 1.0083645693957806, "learning_rate": 7.999738671160847e-06, "loss": -0.0687, "step": 456, "step_time": 13.023524552001618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.319999694824219, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9726762175559998, "epoch": 0.00914, "frac_reward_zero_std": 0.125, "grad_norm": 0.1135082095861435, "kl": 0.9778477437794209, "learning_rate": 7.999737425274523e-06, "loss": -0.0508, "num_tokens": 10862375.0, "reward": 0.25678712129592896, "reward_std": 0.8866508603096008, "rewards/rollout_reward_func/mean": 0.25678712129592896, "rewards/rollout_reward_func/std": 0.8866508603096008, "sampling/importance_sampling_ratio/max": 1.5245640277862549, "sampling/importance_sampling_ratio/mean": 0.6281532645225525, "sampling/importance_sampling_ratio/min": 0.000177479611011222, "sampling/sampling_logp_difference/max": 2.5483298301696777, "sampling/sampling_logp_difference/mean": 0.28668180108070374, "step": 457, "step_time": 31.96379564498784 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.970322072505951, "epoch": 0.00916, "grad_norm": 0.104444719851017, "kl": 0.8642452880740166, "learning_rate": 7.999736176425502e-06, "loss": -0.0509, "step": 458, "step_time": 15.904057608975563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7132128477096558, "epoch": 0.00918, "frac_reward_zero_std": 0.125, "grad_norm": 0.1075049415230751, "kl": 0.37188928574323654, "learning_rate": 7.999734924613788e-06, "loss": -0.0585, "num_tokens": 10911361.0, "reward": 0.7948278188705444, "reward_std": 0.805321216583252, "rewards/rollout_reward_func/mean": 0.7948278188705444, "rewards/rollout_reward_func/std": 0.805321216583252, "sampling/importance_sampling_ratio/max": 1.6854609251022339, "sampling/importance_sampling_ratio/mean": 0.8449202179908752, "sampling/importance_sampling_ratio/min": 1.988924850593321e-06, "sampling/sampling_logp_difference/max": 2.3592171669006348, "sampling/sampling_logp_difference/mean": 0.33711886405944824, "step": 459, "step_time": 23.25297958700685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7181018590927124, "epoch": 0.0092, "grad_norm": 0.1034657284617424, "kl": 0.3790584057569504, "learning_rate": 7.99973366983938e-06, "loss": -0.0592, "step": 460, "step_time": 12.828622331056977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.65625, "completions/mean_terminated_length": 5.782608985900879, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.169342577457428, "epoch": 0.00922, "frac_reward_zero_std": 0.125, "grad_norm": 0.04064909368753433, "kl": 0.4703243747353554, "learning_rate": 7.99973241210228e-06, "loss": -0.0644, "num_tokens": 10968253.0, "reward": 0.4715154767036438, "reward_std": 0.8544443249702454, "rewards/rollout_reward_func/mean": 0.4715154767036438, "rewards/rollout_reward_func/std": 0.8544443249702454, "sampling/importance_sampling_ratio/max": 1.4486892223358154, "sampling/importance_sampling_ratio/mean": 0.5864831805229187, "sampling/importance_sampling_ratio/min": 5.010761015000753e-07, "sampling/sampling_logp_difference/max": 1.945239543914795, "sampling/sampling_logp_difference/mean": 0.32647213339805603, "step": 461, "step_time": 26.968863111978862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.166378140449524, "epoch": 0.00924, "grad_norm": 0.03934725001454353, "kl": 0.46305543184280396, "learning_rate": 7.99973115140249e-06, "loss": -0.0645, "step": 462, "step_time": 13.071922568022273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 5.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.292215347290039, "epoch": 0.00926, "frac_reward_zero_std": 0.125, "grad_norm": 0.03295091539621353, "kl": 0.5600125342607498, "learning_rate": 7.999729887740009e-06, "loss": -0.0732, "num_tokens": 11016015.0, "reward": 0.5397815704345703, "reward_std": 0.9605303406715393, "rewards/rollout_reward_func/mean": 0.5397815704345703, "rewards/rollout_reward_func/std": 0.9605303406715393, "sampling/importance_sampling_ratio/max": 1.4121302366256714, "sampling/importance_sampling_ratio/mean": 0.6480295062065125, "sampling/importance_sampling_ratio/min": 3.3250631759074167e-07, "sampling/sampling_logp_difference/max": 1.9779572486877441, "sampling/sampling_logp_difference/mean": 0.4076273441314697, "step": 463, "step_time": 23.518162843975006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2999092638492584, "epoch": 0.00928, "grad_norm": 0.03245444595813751, "kl": 0.5617373771965504, "learning_rate": 7.999728621114841e-06, "loss": -0.0733, "step": 464, "step_time": 11.92317982102395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 5.079999923706055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3971155285835266, "epoch": 0.0093, "frac_reward_zero_std": 0.375, "grad_norm": 0.05883027985692024, "kl": 0.213636115193367, "learning_rate": 7.999727351526984e-06, "loss": -0.0444, "num_tokens": 11067181.0, "reward": 0.4950651228427887, "reward_std": 0.9341306686401367, "rewards/rollout_reward_func/mean": 0.4950651228427887, "rewards/rollout_reward_func/std": 0.9341306090354919, "sampling/importance_sampling_ratio/max": 1.3770270347595215, "sampling/importance_sampling_ratio/mean": 0.6621330976486206, "sampling/importance_sampling_ratio/min": 3.453997621249982e-08, "sampling/sampling_logp_difference/max": 2.1521167755126953, "sampling/sampling_logp_difference/mean": 0.40678706765174866, "step": 465, "step_time": 24.35969702800503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3986937403678894, "epoch": 0.00932, "grad_norm": 0.05836404860019684, "kl": 0.21368126198649406, "learning_rate": 7.999726078976442e-06, "loss": -0.0445, "step": 466, "step_time": 12.071233638038393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 4.099999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9885086417198181, "epoch": 0.00934, "frac_reward_zero_std": 0.125, "grad_norm": 0.09778470546007156, "kl": 0.16430404037237167, "learning_rate": 7.999724803463217e-06, "loss": -0.1038, "num_tokens": 11123226.0, "reward": 0.20234373211860657, "reward_std": 0.9363327622413635, "rewards/rollout_reward_func/mean": 0.20234373211860657, "rewards/rollout_reward_func/std": 0.9363327026367188, "sampling/importance_sampling_ratio/max": 1.7808245420455933, "sampling/importance_sampling_ratio/mean": 0.6760156154632568, "sampling/importance_sampling_ratio/min": 8.104209700832143e-05, "sampling/sampling_logp_difference/max": 1.984206199645996, "sampling/sampling_logp_difference/mean": 0.3170372247695923, "step": 467, "step_time": 31.0805853040074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9908936321735382, "epoch": 0.00936, "grad_norm": 0.08866715431213379, "kl": 0.16746091190725565, "learning_rate": 7.99972352498731e-06, "loss": -0.1043, "step": 468, "step_time": 15.204198036983144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6652364134788513, "epoch": 0.00938, "frac_reward_zero_std": 0.125, "grad_norm": 0.11965721100568771, "kl": 0.9052584245800972, "learning_rate": 7.999722243548717e-06, "loss": -0.0693, "num_tokens": 11172054.0, "reward": 0.8229295015335083, "reward_std": 0.8624536395072937, "rewards/rollout_reward_func/mean": 0.8229295015335083, "rewards/rollout_reward_func/std": 0.8624535799026489, "sampling/importance_sampling_ratio/max": 1.6886042356491089, "sampling/importance_sampling_ratio/mean": 0.8397489786148071, "sampling/importance_sampling_ratio/min": 5.463871133315479e-09, "sampling/sampling_logp_difference/max": 1.9018070697784424, "sampling/sampling_logp_difference/mean": 0.30394428968429565, "step": 469, "step_time": 25.81856587700895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0096726194024086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0096726194024086, "entropy": 1.6591995060443878, "epoch": 0.0094, "grad_norm": 0.10740377008914948, "kl": 0.8807784169912338, "learning_rate": 7.999720959147445e-06, "loss": -0.07, "step": 470, "step_time": 14.067836897011148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.90625, "completions/mean_terminated_length": 5.639999866485596, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.832933634519577, "epoch": 0.00942, "frac_reward_zero_std": 0.0, "grad_norm": 0.050130024552345276, "kl": 0.4304448515176773, "learning_rate": 7.999719671783495e-06, "loss": -0.086, "num_tokens": 11228176.0, "reward": 0.36768561601638794, "reward_std": 0.879940927028656, "rewards/rollout_reward_func/mean": 0.36768561601638794, "rewards/rollout_reward_func/std": 0.8799408674240112, "sampling/importance_sampling_ratio/max": 1.5547008514404297, "sampling/importance_sampling_ratio/mean": 0.6577726602554321, "sampling/importance_sampling_ratio/min": 1.0223156095889863e-05, "sampling/sampling_logp_difference/max": 2.3461248874664307, "sampling/sampling_logp_difference/mean": 0.31086063385009766, "step": 471, "step_time": 26.49706220702501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024509804788976908, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024509804788976908, "entropy": 1.826078623533249, "epoch": 0.00944, "grad_norm": 0.047749679535627365, "kl": 0.47343654930591583, "learning_rate": 7.999718381456866e-06, "loss": -0.086, "step": 472, "step_time": 13.395612046006136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5, "completions/mean_terminated_length": 5.5652174949646, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0543091893196106, "epoch": 0.00946, "frac_reward_zero_std": 0.0, "grad_norm": 0.07610951364040375, "kl": 0.4514426030218601, "learning_rate": 7.99971708816756e-06, "loss": -0.0877, "num_tokens": 11282914.0, "reward": 0.40609920024871826, "reward_std": 1.0272263288497925, "rewards/rollout_reward_func/mean": 0.40609920024871826, "rewards/rollout_reward_func/std": 1.0272263288497925, "sampling/importance_sampling_ratio/max": 1.4613779783248901, "sampling/importance_sampling_ratio/mean": 0.5610795021057129, "sampling/importance_sampling_ratio/min": 0.00020108273020014167, "sampling/sampling_logp_difference/max": 1.8527545928955078, "sampling/sampling_logp_difference/mean": 0.28106456995010376, "step": 473, "step_time": 26.0119741250237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.060105115175247, "epoch": 0.00948, "grad_norm": 0.07552831619977951, "kl": 0.4363028910011053, "learning_rate": 7.999715791915578e-06, "loss": -0.0879, "step": 474, "step_time": 12.207388870039722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.214285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5931484401226044, "epoch": 0.0095, "frac_reward_zero_std": 0.125, "grad_norm": 0.11044373363256454, "kl": 0.5775267742574215, "learning_rate": 7.999714492700921e-06, "loss": -0.0536, "num_tokens": 11342866.0, "reward": 0.4281522035598755, "reward_std": 0.7930460572242737, "rewards/rollout_reward_func/mean": 0.4281522035598755, "rewards/rollout_reward_func/std": 0.7930459976196289, "sampling/importance_sampling_ratio/max": 2.297135829925537, "sampling/importance_sampling_ratio/mean": 0.817914605140686, "sampling/importance_sampling_ratio/min": 9.656123438617215e-06, "sampling/sampling_logp_difference/max": 2.1112592220306396, "sampling/sampling_logp_difference/mean": 0.2862268388271332, "step": 475, "step_time": 25.90320919197984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5914992094039917, "epoch": 0.00952, "grad_norm": 0.10961706191301346, "kl": 0.5518639907240868, "learning_rate": 7.999713190523593e-06, "loss": -0.0536, "step": 476, "step_time": 13.048670068994397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.6875, "completions/mean_terminated_length": 3.9333336353302, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8044179975986481, "epoch": 0.00954, "frac_reward_zero_std": 0.125, "grad_norm": 0.14046238362789154, "kl": 0.8089933097362518, "learning_rate": 7.999711885383593e-06, "loss": -0.0462, "num_tokens": 11394923.0, "reward": 0.7564988732337952, "reward_std": 0.7877184152603149, "rewards/rollout_reward_func/mean": 0.7564988732337952, "rewards/rollout_reward_func/std": 0.7877184152603149, "sampling/importance_sampling_ratio/max": 1.5942875146865845, "sampling/importance_sampling_ratio/mean": 0.9373886585235596, "sampling/importance_sampling_ratio/min": 0.002174949739128351, "sampling/sampling_logp_difference/max": 1.6726665496826172, "sampling/sampling_logp_difference/mean": 0.16835016012191772, "step": 477, "step_time": 23.21816830796888 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8038435950875282, "epoch": 0.00956, "grad_norm": 0.1392688900232315, "kl": 0.753171518445015, "learning_rate": 7.99971057728092e-06, "loss": -0.0465, "step": 478, "step_time": 12.830173097027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3851932436227798, "epoch": 0.00958, "frac_reward_zero_std": 0.125, "grad_norm": 0.06238321587443352, "kl": 0.3692801557481289, "learning_rate": 7.99970926621558e-06, "loss": -0.0616, "num_tokens": 11446246.0, "reward": 0.6651728749275208, "reward_std": 0.9210944175720215, "rewards/rollout_reward_func/mean": 0.6651728749275208, "rewards/rollout_reward_func/std": 0.9210944175720215, "sampling/importance_sampling_ratio/max": 1.2868702411651611, "sampling/importance_sampling_ratio/mean": 0.7829701900482178, "sampling/importance_sampling_ratio/min": 8.635842095827684e-05, "sampling/sampling_logp_difference/max": 1.8260793685913086, "sampling/sampling_logp_difference/mean": 0.22748109698295593, "step": 479, "step_time": 24.709817062946968 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "entropy": 1.3890264332294464, "epoch": 0.0096, "grad_norm": 0.06400534510612488, "kl": 0.33477237075567245, "learning_rate": 7.999707952187571e-06, "loss": -0.0617, "step": 480, "step_time": 12.551621232007165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6692828834056854, "epoch": 0.00962, "frac_reward_zero_std": 0.375, "grad_norm": 0.05598883703351021, "kl": 0.5112681649625301, "learning_rate": 7.999706635196896e-06, "loss": -0.037, "num_tokens": 11495275.0, "reward": 0.45394235849380493, "reward_std": 0.8241062164306641, "rewards/rollout_reward_func/mean": 0.45394235849380493, "rewards/rollout_reward_func/std": 0.8241061568260193, "sampling/importance_sampling_ratio/max": 1.3484936952590942, "sampling/importance_sampling_ratio/mean": 0.8186933994293213, "sampling/importance_sampling_ratio/min": 4.790018692801823e-07, "sampling/sampling_logp_difference/max": 2.319638729095459, "sampling/sampling_logp_difference/mean": 0.2956033945083618, "step": 481, "step_time": 20.523410163936205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6676370203495026, "epoch": 0.00964, "grad_norm": 0.05073746293783188, "kl": 0.45997415482997894, "learning_rate": 7.999705315243556e-06, "loss": -0.0372, "step": 482, "step_time": 10.43862583799637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.964285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3772654309868813, "epoch": 0.00966, "frac_reward_zero_std": 0.125, "grad_norm": 0.05126955732703209, "kl": 0.38321883231401443, "learning_rate": 7.999703992327552e-06, "loss": -0.0804, "num_tokens": 11540178.0, "reward": 0.830952525138855, "reward_std": 0.9155949950218201, "rewards/rollout_reward_func/mean": 0.830952525138855, "rewards/rollout_reward_func/std": 0.9155949354171753, "sampling/importance_sampling_ratio/max": 1.1803642511367798, "sampling/importance_sampling_ratio/mean": 0.8365136384963989, "sampling/importance_sampling_ratio/min": 7.744793038000353e-06, "sampling/sampling_logp_difference/max": 1.6892807483673096, "sampling/sampling_logp_difference/mean": 0.2735418975353241, "step": 483, "step_time": 22.008286275027785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3765578642487526, "epoch": 0.00968, "grad_norm": 0.051093827933073044, "kl": 0.37295592576265335, "learning_rate": 7.999702666448885e-06, "loss": -0.0805, "step": 484, "step_time": 11.822391162015265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 4.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4815154522657394, "epoch": 0.0097, "frac_reward_zero_std": 0.0, "grad_norm": 0.25742483139038086, "kl": 1.3465082123875618, "learning_rate": 7.999701337607558e-06, "loss": -0.0521, "num_tokens": 11596825.0, "reward": 0.737997829914093, "reward_std": 0.8085649609565735, "rewards/rollout_reward_func/mean": 0.737997829914093, "rewards/rollout_reward_func/std": 0.8085649609565735, "sampling/importance_sampling_ratio/max": 1.4971102476119995, "sampling/importance_sampling_ratio/mean": 0.9123359322547913, "sampling/importance_sampling_ratio/min": 1.8269773818246904e-06, "sampling/sampling_logp_difference/max": 2.0485801696777344, "sampling/sampling_logp_difference/mean": 0.2546086311340332, "step": 485, "step_time": 23.55260494403774 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.4867502748966217, "epoch": 0.00972, "grad_norm": 0.17977148294448853, "kl": 0.929692804813385, "learning_rate": 7.999700005803569e-06, "loss": -0.0539, "step": 486, "step_time": 12.3283929299796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.7457568645477295, "epoch": 0.00974, "frac_reward_zero_std": 0.0, "grad_norm": 0.13545767962932587, "kl": 0.1801897156983614, "learning_rate": 7.999698671036923e-06, "loss": -0.0957, "num_tokens": 11658321.0, "reward": 0.020102284848690033, "reward_std": 0.7311347126960754, "rewards/rollout_reward_func/mean": 0.020102284848690033, "rewards/rollout_reward_func/std": 0.7311347126960754, "sampling/importance_sampling_ratio/max": 1.6420609951019287, "sampling/importance_sampling_ratio/mean": 0.5507954359054565, "sampling/importance_sampling_ratio/min": 1.2715372577076778e-06, "sampling/sampling_logp_difference/max": 1.8350915908813477, "sampling/sampling_logp_difference/mean": 0.3909551799297333, "step": 487, "step_time": 28.898589000978973 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.7565730810165405, "epoch": 0.00976, "grad_norm": 0.15440644323825836, "kl": 0.172921571880579, "learning_rate": 7.999697333307616e-06, "loss": -0.0951, "step": 488, "step_time": 13.101107285008766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.857142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8175806403160095, "epoch": 0.00978, "frac_reward_zero_std": 0.125, "grad_norm": 0.04961870238184929, "kl": 0.4751783609390259, "learning_rate": 7.999695992615658e-06, "loss": -0.0854, "num_tokens": 11704233.0, "reward": 0.7745914459228516, "reward_std": 0.8965354561805725, "rewards/rollout_reward_func/mean": 0.7745914459228516, "rewards/rollout_reward_func/std": 0.8965354561805725, "sampling/importance_sampling_ratio/max": 1.2973275184631348, "sampling/importance_sampling_ratio/mean": 0.7025195360183716, "sampling/importance_sampling_ratio/min": 2.914936203524121e-06, "sampling/sampling_logp_difference/max": 2.0831212997436523, "sampling/sampling_logp_difference/mean": 0.3559083938598633, "step": 489, "step_time": 22.488423713017255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.818581461906433, "epoch": 0.0098, "grad_norm": 0.04909753426909447, "kl": 0.5028202757239342, "learning_rate": 7.999694648961041e-06, "loss": -0.0854, "step": 490, "step_time": 11.350012916023843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.5625, "completions/mean_terminated_length": 5.65217399597168, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9034771621227264, "epoch": 0.00982, "frac_reward_zero_std": 0.25, "grad_norm": 0.15189778804779053, "kl": 0.39213256910443306, "learning_rate": 7.999693302343772e-06, "loss": -0.038, "num_tokens": 11758371.0, "reward": 0.20985586941242218, "reward_std": 0.911850094795227, "rewards/rollout_reward_func/mean": 0.20985586941242218, "rewards/rollout_reward_func/std": 0.911850094795227, "sampling/importance_sampling_ratio/max": 1.3226752281188965, "sampling/importance_sampling_ratio/mean": 0.5197591781616211, "sampling/importance_sampling_ratio/min": 2.720629890973214e-05, "sampling/sampling_logp_difference/max": 1.8557302951812744, "sampling/sampling_logp_difference/mean": 0.32832491397857666, "step": 491, "step_time": 27.245846400008304 }, { "clip_ratio/high_max": 0.004545454401522875, "clip_ratio/high_mean": 0.0022727272007614374, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022727272007614374, "entropy": 1.9007720947265625, "epoch": 0.00984, "grad_norm": 0.13291007280349731, "kl": 0.39498909562826157, "learning_rate": 7.999691952763851e-06, "loss": -0.0388, "step": 492, "step_time": 12.725082424032735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.629629611968994, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3179179057478905, "epoch": 0.00986, "frac_reward_zero_std": 0.25, "grad_norm": 0.06816055625677109, "kl": 0.23638468608260155, "learning_rate": 7.99969060022128e-06, "loss": -0.0635, "num_tokens": 11810508.0, "reward": 0.3718250095844269, "reward_std": 0.7873470187187195, "rewards/rollout_reward_func/mean": 0.3718250095844269, "rewards/rollout_reward_func/std": 0.7873470187187195, "sampling/importance_sampling_ratio/max": 1.627014398574829, "sampling/importance_sampling_ratio/mean": 0.9362068176269531, "sampling/importance_sampling_ratio/min": 1.1320448720653076e-05, "sampling/sampling_logp_difference/max": 2.1811203956604004, "sampling/sampling_logp_difference/mean": 0.22120876610279083, "step": 493, "step_time": 24.863352419954026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3136205114424229, "epoch": 0.00988, "grad_norm": 0.06930945813655853, "kl": 0.23734459280967712, "learning_rate": 7.999689244716059e-06, "loss": -0.0635, "step": 494, "step_time": 13.170727659016848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 3.9642858505249023, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1347360871732235, "epoch": 0.0099, "frac_reward_zero_std": 0.375, "grad_norm": 0.11061327904462814, "kl": 0.363203264772892, "learning_rate": 7.99968788624819e-06, "loss": -0.0554, "num_tokens": 11858720.0, "reward": 0.9676085114479065, "reward_std": 0.7554826140403748, "rewards/rollout_reward_func/mean": 0.9676085114479065, "rewards/rollout_reward_func/std": 0.7554826736450195, "sampling/importance_sampling_ratio/max": 1.313605785369873, "sampling/importance_sampling_ratio/mean": 0.9177572727203369, "sampling/importance_sampling_ratio/min": 3.682906481117243e-06, "sampling/sampling_logp_difference/max": 1.6596639156341553, "sampling/sampling_logp_difference/mean": 0.24352729320526123, "step": 495, "step_time": 22.308414951025043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1316916719079018, "epoch": 0.00992, "grad_norm": 0.10821837186813354, "kl": 0.3833450824022293, "learning_rate": 7.999686524817674e-06, "loss": -0.0555, "step": 496, "step_time": 11.89167488599196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8623497597873211, "epoch": 0.00994, "frac_reward_zero_std": 0.25, "grad_norm": 0.11152645200490952, "kl": 0.4168413281440735, "learning_rate": 7.999685160424513e-06, "loss": -0.0492, "num_tokens": 11904376.0, "reward": 1.0607961416244507, "reward_std": 0.7386355400085449, "rewards/rollout_reward_func/mean": 1.0607961416244507, "rewards/rollout_reward_func/std": 0.7386355996131897, "sampling/importance_sampling_ratio/max": 1.4500727653503418, "sampling/importance_sampling_ratio/mean": 0.9457359910011292, "sampling/importance_sampling_ratio/min": 0.0010710394708439708, "sampling/sampling_logp_difference/max": 1.3512353897094727, "sampling/sampling_logp_difference/mean": 0.15285316109657288, "step": 497, "step_time": 23.157147366029676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8583409748971462, "epoch": 0.00996, "grad_norm": 0.10898127406835556, "kl": 0.43120604380965233, "learning_rate": 7.999683793068708e-06, "loss": -0.0496, "step": 498, "step_time": 12.407052421971457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.46875, "completions/mean_terminated_length": 4.096774101257324, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5511324852705002, "epoch": 0.00998, "frac_reward_zero_std": 0.375, "grad_norm": 0.03535659611225128, "kl": 0.6628879606723785, "learning_rate": 7.99968242275026e-06, "loss": -0.0312, "num_tokens": 11952853.0, "reward": 0.7375078201293945, "reward_std": 0.8341090679168701, "rewards/rollout_reward_func/mean": 0.7375078201293945, "rewards/rollout_reward_func/std": 0.8341090083122253, "sampling/importance_sampling_ratio/max": 1.3908157348632812, "sampling/importance_sampling_ratio/mean": 0.9917770624160767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5898021459579468, "sampling/sampling_logp_difference/mean": 0.13679543137550354, "step": 499, "step_time": 21.761748739052564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5472846925258636, "epoch": 0.01, "grad_norm": 0.039497945457696915, "kl": 0.6840823143720627, "learning_rate": 7.999681049469169e-06, "loss": -0.0311, "step": 500, "step_time": 12.023048904025927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1358620524406433, "epoch": 0.01002, "frac_reward_zero_std": 0.25, "grad_norm": 0.07357535511255264, "kl": 0.4568551331758499, "learning_rate": 7.99967967322544e-06, "loss": -0.0444, "num_tokens": 12001609.0, "reward": 0.6088943481445312, "reward_std": 0.8459611535072327, "rewards/rollout_reward_func/mean": 0.6088943481445312, "rewards/rollout_reward_func/std": 0.8459612131118774, "sampling/importance_sampling_ratio/max": 1.2182048559188843, "sampling/importance_sampling_ratio/mean": 0.8449312448501587, "sampling/importance_sampling_ratio/min": 2.406788735243026e-05, "sampling/sampling_logp_difference/max": 1.7754300832748413, "sampling/sampling_logp_difference/mean": 0.19523444771766663, "step": 501, "step_time": 23.249773012008518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1371465772390366, "epoch": 0.01004, "grad_norm": 0.07168518006801605, "kl": 0.4678947515785694, "learning_rate": 7.999678294019072e-06, "loss": -0.0446, "step": 502, "step_time": 12.148118567012716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.896551609039307, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2074107080698013, "epoch": 0.01006, "frac_reward_zero_std": 0.25, "grad_norm": 0.10057388991117477, "kl": 1.0103638470172882, "learning_rate": 7.999676911850067e-06, "loss": -0.0646, "num_tokens": 12052022.0, "reward": 0.6499794721603394, "reward_std": 0.8434142470359802, "rewards/rollout_reward_func/mean": 0.6499794721603394, "rewards/rollout_reward_func/std": 0.8434142470359802, "sampling/importance_sampling_ratio/max": 1.4971476793289185, "sampling/importance_sampling_ratio/mean": 0.8429380655288696, "sampling/importance_sampling_ratio/min": 1.205331409437349e-05, "sampling/sampling_logp_difference/max": 2.419422149658203, "sampling/sampling_logp_difference/mean": 0.27467918395996094, "step": 503, "step_time": 22.853645069000777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2075629066675901, "epoch": 0.01008, "grad_norm": 0.09353737533092499, "kl": 0.9684667475521564, "learning_rate": 7.999675526718425e-06, "loss": -0.0648, "step": 504, "step_time": 12.087609537004028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 5.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1841551288962364, "epoch": 0.0101, "frac_reward_zero_std": 0.125, "grad_norm": 0.10067061334848404, "kl": 0.26964618638157845, "learning_rate": 7.99967413862415e-06, "loss": -0.0508, "num_tokens": 12102880.0, "reward": 0.19742614030838013, "reward_std": 0.816693902015686, "rewards/rollout_reward_func/mean": 0.19742614030838013, "rewards/rollout_reward_func/std": 0.8166938424110413, "sampling/importance_sampling_ratio/max": 1.3767738342285156, "sampling/importance_sampling_ratio/mean": 0.9016183614730835, "sampling/importance_sampling_ratio/min": 1.1919308917640592e-07, "sampling/sampling_logp_difference/max": 2.1336631774902344, "sampling/sampling_logp_difference/mean": 0.2628144919872284, "step": 505, "step_time": 22.63665503199445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1848979238420725, "epoch": 0.01012, "grad_norm": 0.09499858319759369, "kl": 0.27002413012087345, "learning_rate": 7.99967274756724e-06, "loss": -0.0508, "step": 506, "step_time": 12.187862159014912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.415862262248993, "epoch": 0.01014, "frac_reward_zero_std": 0.25, "grad_norm": 0.07372879236936569, "kl": 0.4330042749643326, "learning_rate": 7.999671353547698e-06, "loss": -0.0471, "num_tokens": 12162440.0, "reward": 0.7895362973213196, "reward_std": 0.7732475996017456, "rewards/rollout_reward_func/mean": 0.7895362973213196, "rewards/rollout_reward_func/std": 0.7732475996017456, "sampling/importance_sampling_ratio/max": 1.4258949756622314, "sampling/importance_sampling_ratio/mean": 0.8341109752655029, "sampling/importance_sampling_ratio/min": 3.177839971613139e-05, "sampling/sampling_logp_difference/max": 1.7154444456100464, "sampling/sampling_logp_difference/mean": 0.2356930673122406, "step": 507, "step_time": 27.05948757796432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4188081920146942, "epoch": 0.01016, "grad_norm": 0.08056343346834183, "kl": 0.4066563993692398, "learning_rate": 7.999669956565526e-06, "loss": -0.0471, "step": 508, "step_time": 13.823660838999785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7194783985614777, "epoch": 0.01018, "frac_reward_zero_std": 0.125, "grad_norm": 0.05798092857003212, "kl": 0.6202885657548904, "learning_rate": 7.999668556620727e-06, "loss": -0.0615, "num_tokens": 12210861.0, "reward": 0.7631380558013916, "reward_std": 0.8520112037658691, "rewards/rollout_reward_func/mean": 0.7631380558013916, "rewards/rollout_reward_func/std": 0.8520112633705139, "sampling/importance_sampling_ratio/max": 1.9279226064682007, "sampling/importance_sampling_ratio/mean": 0.740745484828949, "sampling/importance_sampling_ratio/min": 1.2762795449816622e-05, "sampling/sampling_logp_difference/max": 1.9512956142425537, "sampling/sampling_logp_difference/mean": 0.2906308174133301, "step": 509, "step_time": 21.691736742999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7222045958042145, "epoch": 0.0102, "grad_norm": 0.05565227195620537, "kl": 0.6101917997002602, "learning_rate": 7.999667153713299e-06, "loss": -0.0615, "step": 510, "step_time": 11.147388737968868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.115384578704834, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6547159254550934, "epoch": 0.01022, "frac_reward_zero_std": 0.25, "grad_norm": 0.07018028944730759, "kl": 0.485085628926754, "learning_rate": 7.999665747843243e-06, "loss": -0.031, "num_tokens": 12261556.0, "reward": 0.8064939975738525, "reward_std": 0.8554605841636658, "rewards/rollout_reward_func/mean": 0.8064939975738525, "rewards/rollout_reward_func/std": 0.855460524559021, "sampling/importance_sampling_ratio/max": 1.381920576095581, "sampling/importance_sampling_ratio/mean": 0.8025671243667603, "sampling/importance_sampling_ratio/min": 2.285980116312203e-07, "sampling/sampling_logp_difference/max": 2.416764736175537, "sampling/sampling_logp_difference/mean": 0.3206058740615845, "step": 511, "step_time": 24.777157715987414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6576089262962341, "epoch": 0.01024, "grad_norm": 0.06889304518699646, "kl": 0.4790465086698532, "learning_rate": 7.999664339010564e-06, "loss": -0.0311, "step": 512, "step_time": 13.133122929983074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3025445491075516, "epoch": 0.01026, "frac_reward_zero_std": 0.5, "grad_norm": 0.03181324154138565, "kl": 0.31590207293629646, "learning_rate": 7.999662927215261e-06, "loss": -0.0313, "num_tokens": 12306570.0, "reward": 0.9668885469436646, "reward_std": 0.8668286800384521, "rewards/rollout_reward_func/mean": 0.9668885469436646, "rewards/rollout_reward_func/std": 0.8668286800384521, "sampling/importance_sampling_ratio/max": 1.2141458988189697, "sampling/importance_sampling_ratio/mean": 0.9055132865905762, "sampling/importance_sampling_ratio/min": 1.6297163938361336e-06, "sampling/sampling_logp_difference/max": 1.7982279062271118, "sampling/sampling_logp_difference/mean": 0.19765059649944305, "step": 513, "step_time": 21.740467741998145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3060548901557922, "epoch": 0.01028, "grad_norm": 0.033194415271282196, "kl": 0.3115319088101387, "learning_rate": 7.999661512457336e-06, "loss": -0.0312, "step": 514, "step_time": 11.81125520702335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9650545418262482, "epoch": 0.0103, "frac_reward_zero_std": 0.125, "grad_norm": 0.12937907874584198, "kl": 0.33032790943980217, "learning_rate": 7.999660094736791e-06, "loss": -0.0829, "num_tokens": 12356189.0, "reward": 0.7661001086235046, "reward_std": 0.8560479283332825, "rewards/rollout_reward_func/mean": 0.7661001086235046, "rewards/rollout_reward_func/std": 0.8560478687286377, "sampling/importance_sampling_ratio/max": 1.2754267454147339, "sampling/importance_sampling_ratio/mean": 0.719113826751709, "sampling/importance_sampling_ratio/min": 4.350260041974252e-06, "sampling/sampling_logp_difference/max": 1.7722949981689453, "sampling/sampling_logp_difference/mean": 0.32577911019325256, "step": 515, "step_time": 30.07016871799715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.971908688545227, "epoch": 0.01032, "grad_norm": 0.12825949490070343, "kl": 0.3413505293428898, "learning_rate": 7.999658674053624e-06, "loss": -0.0829, "step": 516, "step_time": 15.428451818006579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1729345619678497, "epoch": 0.01034, "frac_reward_zero_std": 0.125, "grad_norm": 0.07782088220119476, "kl": 0.8248338624835014, "learning_rate": 7.999657250407843e-06, "loss": -0.0619, "num_tokens": 12415082.0, "reward": 0.5244098901748657, "reward_std": 0.8541940450668335, "rewards/rollout_reward_func/mean": 0.5244098901748657, "rewards/rollout_reward_func/std": 0.8541941046714783, "sampling/importance_sampling_ratio/max": 1.2053096294403076, "sampling/importance_sampling_ratio/mean": 0.6456502676010132, "sampling/importance_sampling_ratio/min": 1.0748488676881607e-07, "sampling/sampling_logp_difference/max": 2.5312442779541016, "sampling/sampling_logp_difference/mean": 0.39209455251693726, "step": 517, "step_time": 27.187270703958347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1752613186836243, "epoch": 0.01036, "grad_norm": 0.0781283974647522, "kl": 0.8207434117794037, "learning_rate": 7.999655823799443e-06, "loss": -0.0619, "step": 518, "step_time": 13.196669856988592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1106926500797272, "epoch": 0.01038, "frac_reward_zero_std": 0.125, "grad_norm": 0.060736749321222305, "kl": 0.2970254309475422, "learning_rate": 7.999654394228429e-06, "loss": -0.0614, "num_tokens": 12466548.0, "reward": 0.3086604177951813, "reward_std": 0.8257142305374146, "rewards/rollout_reward_func/mean": 0.3086604177951813, "rewards/rollout_reward_func/std": 0.8257141709327698, "sampling/importance_sampling_ratio/max": 1.407558560371399, "sampling/importance_sampling_ratio/mean": 0.9498922228813171, "sampling/importance_sampling_ratio/min": 2.971311232613516e-06, "sampling/sampling_logp_difference/max": 1.920017957687378, "sampling/sampling_logp_difference/mean": 0.2060813307762146, "step": 519, "step_time": 22.049627283005975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1136351376771927, "epoch": 0.0104, "grad_norm": 0.05223992466926575, "kl": 0.3016367107629776, "learning_rate": 7.9996529616948e-06, "loss": -0.0613, "step": 520, "step_time": 11.317721219005762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5177426859736443, "epoch": 0.01042, "frac_reward_zero_std": 0.125, "grad_norm": 0.11943536251783371, "kl": 0.6257241852581501, "learning_rate": 7.99965152619856e-06, "loss": -0.0629, "num_tokens": 12518448.0, "reward": 0.4477572739124298, "reward_std": 0.8811671733856201, "rewards/rollout_reward_func/mean": 0.4477572739124298, "rewards/rollout_reward_func/std": 0.8811672329902649, "sampling/importance_sampling_ratio/max": 1.3152011632919312, "sampling/importance_sampling_ratio/mean": 0.8805853724479675, "sampling/importance_sampling_ratio/min": 7.11731763658463e-06, "sampling/sampling_logp_difference/max": 1.657319188117981, "sampling/sampling_logp_difference/mean": 0.24682685732841492, "step": 521, "step_time": 24.097181088029174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5181595608592033, "epoch": 0.01044, "grad_norm": 0.12023449689149857, "kl": 0.6012099161744118, "learning_rate": 7.999650087739709e-06, "loss": -0.0632, "step": 522, "step_time": 12.466102414007764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9786033034324646, "epoch": 0.01046, "frac_reward_zero_std": 0.25, "grad_norm": 0.0697411298751831, "kl": 0.5801813453435898, "learning_rate": 7.999648646318248e-06, "loss": -0.0596, "num_tokens": 12570273.0, "reward": 0.2638906240463257, "reward_std": 0.7513746619224548, "rewards/rollout_reward_func/mean": 0.2638906240463257, "rewards/rollout_reward_func/std": 0.7513746619224548, "sampling/importance_sampling_ratio/max": 1.4409129619598389, "sampling/importance_sampling_ratio/mean": 0.7867530584335327, "sampling/importance_sampling_ratio/min": 2.2943506792216795e-06, "sampling/sampling_logp_difference/max": 1.7500898838043213, "sampling/sampling_logp_difference/mean": 0.308070570230484, "step": 523, "step_time": 24.73171374501544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9911774694919586, "epoch": 0.01048, "grad_norm": 0.06303861737251282, "kl": 0.542410708963871, "learning_rate": 7.999647201934178e-06, "loss": -0.0599, "step": 524, "step_time": 12.457847340003354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.199999809265137, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.565779447555542, "epoch": 0.0105, "frac_reward_zero_std": 0.125, "grad_norm": 0.06306900084018707, "kl": 0.30976220965385437, "learning_rate": 7.999645754587504e-06, "loss": -0.0563, "num_tokens": 12617842.0, "reward": 0.30855661630630493, "reward_std": 0.9055998921394348, "rewards/rollout_reward_func/mean": 0.30855661630630493, "rewards/rollout_reward_func/std": 0.9055998921394348, "sampling/importance_sampling_ratio/max": 1.3929787874221802, "sampling/importance_sampling_ratio/mean": 0.8466913104057312, "sampling/importance_sampling_ratio/min": 2.855396701306745e-07, "sampling/sampling_logp_difference/max": 1.876957893371582, "sampling/sampling_logp_difference/mean": 0.2560387849807739, "step": 525, "step_time": 27.61353646099451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.565647929906845, "epoch": 0.01052, "grad_norm": 0.06262112408876419, "kl": 0.3235234208405018, "learning_rate": 7.999644304278223e-06, "loss": -0.0565, "step": 526, "step_time": 14.126206508954056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5577512085437775, "epoch": 0.01054, "frac_reward_zero_std": 0.25, "grad_norm": 0.03911898657679558, "kl": 0.3035438731312752, "learning_rate": 7.999642851006338e-06, "loss": -0.0716, "num_tokens": 12668134.0, "reward": 0.6600576639175415, "reward_std": 0.9244891405105591, "rewards/rollout_reward_func/mean": 0.6600576639175415, "rewards/rollout_reward_func/std": 0.9244891405105591, "sampling/importance_sampling_ratio/max": 1.2846589088439941, "sampling/importance_sampling_ratio/mean": 0.8502278923988342, "sampling/importance_sampling_ratio/min": 4.302401919176191e-07, "sampling/sampling_logp_difference/max": 2.0346007347106934, "sampling/sampling_logp_difference/mean": 0.2836346924304962, "step": 527, "step_time": 23.663607598020462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5569210052490234, "epoch": 0.01056, "grad_norm": 0.040436550974845886, "kl": 0.2940635532140732, "learning_rate": 7.999641394771852e-06, "loss": -0.0716, "step": 528, "step_time": 12.63024720098474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.892857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7531498670578003, "epoch": 0.01058, "frac_reward_zero_std": 0.0, "grad_norm": 0.06231966242194176, "kl": 0.5559861361980438, "learning_rate": 7.999639935574764e-06, "loss": -0.0612, "num_tokens": 12718806.0, "reward": 0.45241260528564453, "reward_std": 0.8516740798950195, "rewards/rollout_reward_func/mean": 0.45241260528564453, "rewards/rollout_reward_func/std": 0.8516740202903748, "sampling/importance_sampling_ratio/max": 1.5556806325912476, "sampling/importance_sampling_ratio/mean": 0.7871519327163696, "sampling/importance_sampling_ratio/min": 5.31075102117029e-07, "sampling/sampling_logp_difference/max": 2.2798430919647217, "sampling/sampling_logp_difference/mean": 0.3454243540763855, "step": 529, "step_time": 22.97038230599719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7512745559215546, "epoch": 0.0106, "grad_norm": 0.06235784292221069, "kl": 0.5870553180575371, "learning_rate": 7.999638473415077e-06, "loss": -0.0612, "step": 530, "step_time": 12.190719896025257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.875, "completions/mean_terminated_length": 4.769230842590332, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6381565257906914, "epoch": 0.01062, "frac_reward_zero_std": 0.0, "grad_norm": 0.05174154415726662, "kl": 0.7453885450959206, "learning_rate": 7.999637008292793e-06, "loss": -0.1008, "num_tokens": 12775621.0, "reward": 0.807553768157959, "reward_std": 0.8244325518608093, "rewards/rollout_reward_func/mean": 0.807553768157959, "rewards/rollout_reward_func/std": 0.8244324922561646, "sampling/importance_sampling_ratio/max": 1.4031575918197632, "sampling/importance_sampling_ratio/mean": 0.7292793989181519, "sampling/importance_sampling_ratio/min": 1.972089557966683e-05, "sampling/sampling_logp_difference/max": 1.8991755247116089, "sampling/sampling_logp_difference/mean": 0.296416699886322, "step": 531, "step_time": 26.03444512601709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6373745650053024, "epoch": 0.01064, "grad_norm": 0.04933806136250496, "kl": 0.7325374782085419, "learning_rate": 7.999635540207911e-06, "loss": -0.1009, "step": 532, "step_time": 12.929731406999053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5752965211868286, "epoch": 0.01066, "frac_reward_zero_std": 0.25, "grad_norm": 0.04080164432525635, "kl": 0.572898805141449, "learning_rate": 7.999634069160435e-06, "loss": -0.0474, "num_tokens": 12823977.0, "reward": 0.5956643223762512, "reward_std": 0.8314892649650574, "rewards/rollout_reward_func/mean": 0.5956643223762512, "rewards/rollout_reward_func/std": 0.8314893245697021, "sampling/importance_sampling_ratio/max": 1.2374554872512817, "sampling/importance_sampling_ratio/mean": 0.84393709897995, "sampling/importance_sampling_ratio/min": 1.3703532204090152e-05, "sampling/sampling_logp_difference/max": 1.8723429441452026, "sampling/sampling_logp_difference/mean": 0.2523933947086334, "step": 533, "step_time": 23.38536061596824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.573609247803688, "epoch": 0.01068, "grad_norm": 0.042749278247356415, "kl": 0.5861099995672703, "learning_rate": 7.999632595150365e-06, "loss": -0.0473, "step": 534, "step_time": 12.719529538968345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1314930878579617, "epoch": 0.0107, "frac_reward_zero_std": 0.0, "grad_norm": 0.07255411148071289, "kl": 0.2711847834289074, "learning_rate": 7.999631118177703e-06, "loss": -0.069, "num_tokens": 12878858.0, "reward": 0.7207686901092529, "reward_std": 0.8905079364776611, "rewards/rollout_reward_func/mean": 0.7207686901092529, "rewards/rollout_reward_func/std": 0.8905078768730164, "sampling/importance_sampling_ratio/max": 1.3525795936584473, "sampling/importance_sampling_ratio/mean": 0.9024368524551392, "sampling/importance_sampling_ratio/min": 3.488257505068759e-07, "sampling/sampling_logp_difference/max": 2.0847530364990234, "sampling/sampling_logp_difference/mean": 0.21986263990402222, "step": 535, "step_time": 21.469133139005862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1282103173434734, "epoch": 0.01072, "grad_norm": 0.07254830002784729, "kl": 0.2709924466907978, "learning_rate": 7.999629638242451e-06, "loss": -0.0691, "step": 536, "step_time": 11.081173551036045 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.5416669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7064822763204575, "epoch": 0.01074, "frac_reward_zero_std": 0.25, "grad_norm": 0.156602680683136, "kl": 0.5044611915946007, "learning_rate": 7.999628155344608e-06, "loss": -0.0639, "num_tokens": 12930054.0, "reward": 0.548663854598999, "reward_std": 0.8497306108474731, "rewards/rollout_reward_func/mean": 0.548663854598999, "rewards/rollout_reward_func/std": 0.8497306108474731, "sampling/importance_sampling_ratio/max": 1.3553048372268677, "sampling/importance_sampling_ratio/mean": 0.7714248895645142, "sampling/importance_sampling_ratio/min": 3.252914382301242e-09, "sampling/sampling_logp_difference/max": 2.854118824005127, "sampling/sampling_logp_difference/mean": 0.2999042868614197, "step": 537, "step_time": 23.464268267998705 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 1.7096136957406998, "epoch": 0.01076, "grad_norm": 0.12756933271884918, "kl": 0.4550236202776432, "learning_rate": 7.999626669484177e-06, "loss": -0.0643, "step": 538, "step_time": 11.853447653003968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1537478268146515, "epoch": 0.01078, "frac_reward_zero_std": 0.0, "grad_norm": 0.16154691576957703, "kl": 0.11149059794843197, "learning_rate": 7.999625180661162e-06, "loss": -0.0952, "num_tokens": 12987236.0, "reward": 0.21996092796325684, "reward_std": 0.9492294788360596, "rewards/rollout_reward_func/mean": 0.21996092796325684, "rewards/rollout_reward_func/std": 0.9492294192314148, "sampling/importance_sampling_ratio/max": 1.705773115158081, "sampling/importance_sampling_ratio/mean": 0.7077056169509888, "sampling/importance_sampling_ratio/min": 2.935959491878748e-05, "sampling/sampling_logp_difference/max": 1.9419164657592773, "sampling/sampling_logp_difference/mean": 0.2803880274295807, "step": 539, "step_time": 27.041299217031337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.14905247092247, "epoch": 0.0108, "grad_norm": 0.14130644500255585, "kl": 0.11064320430159569, "learning_rate": 7.99962368887556e-06, "loss": -0.0957, "step": 540, "step_time": 13.011769343022024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.1875, "completions/mean_terminated_length": 4.4666666984558105, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9141375049948692, "epoch": 0.01082, "frac_reward_zero_std": 0.5, "grad_norm": 0.039939604699611664, "kl": 0.43230080232024193, "learning_rate": 7.999622194127377e-06, "loss": -0.0437, "num_tokens": 13035009.0, "reward": 0.7699954509735107, "reward_std": 0.8910703063011169, "rewards/rollout_reward_func/mean": 0.7699954509735107, "rewards/rollout_reward_func/std": 0.8910702466964722, "sampling/importance_sampling_ratio/max": 1.2016839981079102, "sampling/importance_sampling_ratio/mean": 0.9478449821472168, "sampling/importance_sampling_ratio/min": 0.0019355271942913532, "sampling/sampling_logp_difference/max": 1.6028704643249512, "sampling/sampling_logp_difference/mean": 0.15005400776863098, "step": 541, "step_time": 22.92318534301012 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.9047074764966965, "epoch": 0.01084, "grad_norm": 0.031678248196840286, "kl": 0.3973258025944233, "learning_rate": 7.999620696416612e-06, "loss": -0.044, "step": 542, "step_time": 12.525424681021832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1562129110097885, "epoch": 0.01086, "frac_reward_zero_std": 0.125, "grad_norm": 0.024798009544610977, "kl": 0.4041166231036186, "learning_rate": 7.999619195743264e-06, "loss": -0.026, "num_tokens": 13090592.0, "reward": 0.8285502791404724, "reward_std": 0.8703917264938354, "rewards/rollout_reward_func/mean": 0.8285502791404724, "rewards/rollout_reward_func/std": 0.8703917860984802, "sampling/importance_sampling_ratio/max": 1.3511508703231812, "sampling/importance_sampling_ratio/mean": 0.8388510942459106, "sampling/importance_sampling_ratio/min": 0.002013313351199031, "sampling/sampling_logp_difference/max": 1.3205714225769043, "sampling/sampling_logp_difference/mean": 0.18463751673698425, "step": 543, "step_time": 29.428619079990312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1576555371284485, "epoch": 0.01088, "grad_norm": 0.02504989691078663, "kl": 0.3914299048483372, "learning_rate": 7.999617692107338e-06, "loss": -0.026, "step": 544, "step_time": 15.42264686501585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.870967388153076, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1467737406492233, "epoch": 0.0109, "frac_reward_zero_std": 0.125, "grad_norm": 0.14776238799095154, "kl": 0.5840264409780502, "learning_rate": 7.999616185508836e-06, "loss": -0.0303, "num_tokens": 13146280.0, "reward": 0.6564743518829346, "reward_std": 0.8140213489532471, "rewards/rollout_reward_func/mean": 0.6564743518829346, "rewards/rollout_reward_func/std": 0.8140214085578918, "sampling/importance_sampling_ratio/max": 1.4099037647247314, "sampling/importance_sampling_ratio/mean": 0.9213221073150635, "sampling/importance_sampling_ratio/min": 3.2899850339163095e-05, "sampling/sampling_logp_difference/max": 1.6879615783691406, "sampling/sampling_logp_difference/mean": 0.22632598876953125, "step": 545, "step_time": 23.762239122967003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1473852470517159, "epoch": 0.01092, "grad_norm": 0.1418282836675644, "kl": 0.5415317788720131, "learning_rate": 7.999614675947757e-06, "loss": -0.0303, "step": 546, "step_time": 12.564811126037966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.655172348022461, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2989922314882278, "epoch": 0.01094, "frac_reward_zero_std": 0.25, "grad_norm": 0.04593002796173096, "kl": 0.5681069791316986, "learning_rate": 7.999613163424103e-06, "loss": -0.0469, "num_tokens": 13200062.0, "reward": 0.934465229511261, "reward_std": 0.6516063213348389, "rewards/rollout_reward_func/mean": 0.934465229511261, "rewards/rollout_reward_func/std": 0.6516063213348389, "sampling/importance_sampling_ratio/max": 1.605067253112793, "sampling/importance_sampling_ratio/mean": 0.8961511850357056, "sampling/importance_sampling_ratio/min": 1.8374473711446626e-06, "sampling/sampling_logp_difference/max": 2.653139591217041, "sampling/sampling_logp_difference/mean": 0.26545417308807373, "step": 547, "step_time": 22.968716671981383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3024129420518875, "epoch": 0.01096, "grad_norm": 0.04861464723944664, "kl": 0.5210717469453812, "learning_rate": 7.999611647937877e-06, "loss": -0.0469, "step": 548, "step_time": 12.312541321036406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.625, "completions/mean_terminated_length": 4.258064270019531, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6038799732923508, "epoch": 0.01098, "frac_reward_zero_std": 0.5, "grad_norm": 0.024385444819927216, "kl": 0.7116764038801193, "learning_rate": 7.999610129489079e-06, "loss": -0.0418, "num_tokens": 13245368.0, "reward": 0.8180583715438843, "reward_std": 0.8604292869567871, "rewards/rollout_reward_func/mean": 0.8180583715438843, "rewards/rollout_reward_func/std": 0.8604292869567871, "sampling/importance_sampling_ratio/max": 1.3241511583328247, "sampling/importance_sampling_ratio/mean": 0.9481137990951538, "sampling/importance_sampling_ratio/min": 0.00832813698798418, "sampling/sampling_logp_difference/max": 1.4978208541870117, "sampling/sampling_logp_difference/mean": 0.11592315137386322, "step": 549, "step_time": 18.343941524013644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6051986590027809, "epoch": 0.011, "grad_norm": 0.023278405889868736, "kl": 0.687821052968502, "learning_rate": 7.999608608077711e-06, "loss": -0.0419, "step": 550, "step_time": 9.877302774984855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.884615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9031649231910706, "epoch": 0.01102, "frac_reward_zero_std": 0.125, "grad_norm": 0.0876610055565834, "kl": 0.8350556008517742, "learning_rate": 7.999607083703775e-06, "loss": -0.0469, "num_tokens": 13298899.0, "reward": 0.7962609529495239, "reward_std": 0.8349884748458862, "rewards/rollout_reward_func/mean": 0.7962609529495239, "rewards/rollout_reward_func/std": 0.8349884748458862, "sampling/importance_sampling_ratio/max": 1.9344149827957153, "sampling/importance_sampling_ratio/mean": 0.7761427164077759, "sampling/importance_sampling_ratio/min": 1.0188117016696197e-07, "sampling/sampling_logp_difference/max": 2.1573643684387207, "sampling/sampling_logp_difference/mean": 0.34697896242141724, "step": 551, "step_time": 23.248477084038313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9069978594779968, "epoch": 0.01104, "grad_norm": 0.08185651153326035, "kl": 0.795634064823389, "learning_rate": 7.999605556367272e-06, "loss": -0.0471, "step": 552, "step_time": 12.821968526026467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.275862216949463, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9779353588819504, "epoch": 0.01106, "frac_reward_zero_std": 0.125, "grad_norm": 0.06475328654050827, "kl": 0.44878143072128296, "learning_rate": 7.999604026068203e-06, "loss": -0.0642, "num_tokens": 13345908.0, "reward": 0.970379114151001, "reward_std": 0.7010331749916077, "rewards/rollout_reward_func/mean": 0.970379114151001, "rewards/rollout_reward_func/std": 0.7010331749916077, "sampling/importance_sampling_ratio/max": 1.329234004020691, "sampling/importance_sampling_ratio/mean": 0.8460509777069092, "sampling/importance_sampling_ratio/min": 0.0001132612451328896, "sampling/sampling_logp_difference/max": 1.400561809539795, "sampling/sampling_logp_difference/mean": 0.20084229111671448, "step": 553, "step_time": 22.241179330972955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9825340509414673, "epoch": 0.01108, "grad_norm": 0.057614006102085114, "kl": 0.4528149291872978, "learning_rate": 7.99960249280657e-06, "loss": -0.0643, "step": 554, "step_time": 12.092805203981698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 5.300000190734863, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3830100148916245, "epoch": 0.0111, "frac_reward_zero_std": 0.0, "grad_norm": 0.08774225413799286, "kl": 0.5092884860932827, "learning_rate": 7.999600956582375e-06, "loss": -0.069, "num_tokens": 13397649.0, "reward": 0.330369770526886, "reward_std": 0.9137977361679077, "rewards/rollout_reward_func/mean": 0.330369770526886, "rewards/rollout_reward_func/std": 0.9137977361679077, "sampling/importance_sampling_ratio/max": 1.6351056098937988, "sampling/importance_sampling_ratio/mean": 0.8840842247009277, "sampling/importance_sampling_ratio/min": 0.00012769279419444501, "sampling/sampling_logp_difference/max": 2.0156610012054443, "sampling/sampling_logp_difference/mean": 0.24652791023254395, "step": 555, "step_time": 25.293923756020376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.386058747768402, "epoch": 0.01112, "grad_norm": 0.09387297183275223, "kl": 0.49123363569378853, "learning_rate": 7.999599417395618e-06, "loss": -0.0688, "step": 556, "step_time": 12.960580736049451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.35032482072711, "epoch": 0.01114, "frac_reward_zero_std": 0.375, "grad_norm": 0.052463311702013016, "kl": 0.24962110072374344, "learning_rate": 7.999597875246304e-06, "loss": -0.0518, "num_tokens": 13455881.0, "reward": 0.5539238452911377, "reward_std": 0.8811460733413696, "rewards/rollout_reward_func/mean": 0.5539238452911377, "rewards/rollout_reward_func/std": 0.8811461329460144, "sampling/importance_sampling_ratio/max": 1.551816463470459, "sampling/importance_sampling_ratio/mean": 0.8402339220046997, "sampling/importance_sampling_ratio/min": 0.0010613889899104834, "sampling/sampling_logp_difference/max": 1.7456518411636353, "sampling/sampling_logp_difference/mean": 0.21440128982067108, "step": 557, "step_time": 28.291764019988477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.350100465118885, "epoch": 0.01116, "grad_norm": 0.056803926825523376, "kl": 0.24739137291908264, "learning_rate": 7.99959633013443e-06, "loss": -0.052, "step": 558, "step_time": 14.98258492606692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.800000190734863, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.145030990242958, "epoch": 0.01118, "frac_reward_zero_std": 0.25, "grad_norm": 0.04375419393181801, "kl": 0.41437240317463875, "learning_rate": 7.99959478206e-06, "loss": -0.0324, "num_tokens": 13499314.0, "reward": 0.45755520462989807, "reward_std": 0.8481568098068237, "rewards/rollout_reward_func/mean": 0.45755520462989807, "rewards/rollout_reward_func/std": 0.848156750202179, "sampling/importance_sampling_ratio/max": 1.25650155544281, "sampling/importance_sampling_ratio/mean": 0.8360875844955444, "sampling/importance_sampling_ratio/min": 3.116839934591553e-06, "sampling/sampling_logp_difference/max": 2.1987063884735107, "sampling/sampling_logp_difference/mean": 0.24513109028339386, "step": 559, "step_time": 25.940956018021097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1497748419642448, "epoch": 0.0112, "grad_norm": 0.04480281472206116, "kl": 0.4021575227379799, "learning_rate": 7.999593231023017e-06, "loss": -0.0326, "step": 560, "step_time": 14.458656295988476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.78125, "completions/mean_terminated_length": 4.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9649139009416103, "epoch": 0.01122, "frac_reward_zero_std": 0.375, "grad_norm": 0.02951527014374733, "kl": 0.6258630752563477, "learning_rate": 7.999591677023478e-06, "loss": -0.0465, "num_tokens": 13546691.0, "reward": 1.0775861740112305, "reward_std": 0.7112661600112915, "rewards/rollout_reward_func/mean": 1.0775861740112305, "rewards/rollout_reward_func/std": 0.7112661004066467, "sampling/importance_sampling_ratio/max": 1.1736758947372437, "sampling/importance_sampling_ratio/mean": 0.8829044699668884, "sampling/importance_sampling_ratio/min": 7.224232945191034e-07, "sampling/sampling_logp_difference/max": 2.249431610107422, "sampling/sampling_logp_difference/mean": 0.20681101083755493, "step": 561, "step_time": 26.064309915032936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9652276411652565, "epoch": 0.01124, "grad_norm": 0.02874607965350151, "kl": 0.6443635746836662, "learning_rate": 7.999590120061389e-06, "loss": -0.0466, "step": 562, "step_time": 15.03848796300008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 5.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8510020524263382, "epoch": 0.01126, "frac_reward_zero_std": 0.125, "grad_norm": 0.047284457832574844, "kl": 0.41834893077611923, "learning_rate": 7.99958856013675e-06, "loss": -0.0614, "num_tokens": 13596051.0, "reward": 0.7742831707000732, "reward_std": 0.8439435958862305, "rewards/rollout_reward_func/mean": 0.7742831707000732, "rewards/rollout_reward_func/std": 0.8439435958862305, "sampling/importance_sampling_ratio/max": 1.2892364263534546, "sampling/importance_sampling_ratio/mean": 0.7021333575248718, "sampling/importance_sampling_ratio/min": 1.8699631255003624e-05, "sampling/sampling_logp_difference/max": 1.7967113256454468, "sampling/sampling_logp_difference/mean": 0.3300827145576477, "step": 563, "step_time": 23.291491839976516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8517698049545288, "epoch": 0.01128, "grad_norm": 0.04689386114478111, "kl": 0.42816243320703506, "learning_rate": 7.999586997249562e-06, "loss": -0.0614, "step": 564, "step_time": 12.69206561797182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.141026496887207, "epoch": 0.0113, "frac_reward_zero_std": 0.0, "grad_norm": 0.06816111505031586, "kl": 0.33213669434189796, "learning_rate": 7.999585431399826e-06, "loss": -0.0504, "num_tokens": 13648004.0, "reward": 0.2696918547153473, "reward_std": 0.888969898223877, "rewards/rollout_reward_func/mean": 0.2696918547153473, "rewards/rollout_reward_func/std": 0.888969898223877, "sampling/importance_sampling_ratio/max": 1.403732419013977, "sampling/importance_sampling_ratio/mean": 0.7518499493598938, "sampling/importance_sampling_ratio/min": 5.293667527439538e-06, "sampling/sampling_logp_difference/max": 2.133150100708008, "sampling/sampling_logp_difference/mean": 0.33088722825050354, "step": 565, "step_time": 24.8522298099997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.140161454677582, "epoch": 0.01132, "grad_norm": 0.06819755584001541, "kl": 0.31829845160245895, "learning_rate": 7.999583862587546e-06, "loss": -0.0507, "step": 566, "step_time": 12.670965582015924 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 4.2916669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0847955346107483, "epoch": 0.01134, "frac_reward_zero_std": 0.125, "grad_norm": 0.0686340406537056, "kl": 0.35661900602281094, "learning_rate": 7.999582290812721e-06, "loss": -0.0742, "num_tokens": 13699902.0, "reward": 0.25075581669807434, "reward_std": 0.8752051591873169, "rewards/rollout_reward_func/mean": 0.25075581669807434, "rewards/rollout_reward_func/std": 0.8752052187919617, "sampling/importance_sampling_ratio/max": 1.2968599796295166, "sampling/importance_sampling_ratio/mean": 0.70835942029953, "sampling/importance_sampling_ratio/min": 3.4465891076251864e-05, "sampling/sampling_logp_difference/max": 1.8709408044815063, "sampling/sampling_logp_difference/mean": 0.3072100281715393, "step": 567, "step_time": 25.944866314996034 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 2.086367517709732, "epoch": 0.01136, "grad_norm": 0.0767655149102211, "kl": 0.34722356498241425, "learning_rate": 7.999580716075352e-06, "loss": -0.0743, "step": 568, "step_time": 13.25151357895811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.517241477966309, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.147201955318451, "epoch": 0.01138, "frac_reward_zero_std": 0.25, "grad_norm": 0.18301409482955933, "kl": 0.40835798531770706, "learning_rate": 7.999579138375444e-06, "loss": -0.0644, "num_tokens": 13750798.0, "reward": 0.632456362247467, "reward_std": 0.9007640480995178, "rewards/rollout_reward_func/mean": 0.632456362247467, "rewards/rollout_reward_func/std": 0.9007640480995178, "sampling/importance_sampling_ratio/max": 1.5900037288665771, "sampling/importance_sampling_ratio/mean": 0.8612689971923828, "sampling/importance_sampling_ratio/min": 0.00010544259566813707, "sampling/sampling_logp_difference/max": 1.8871088027954102, "sampling/sampling_logp_difference/mean": 0.20677071809768677, "step": 569, "step_time": 26.06751537002856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1471494883298874, "epoch": 0.0114, "grad_norm": 0.1975078582763672, "kl": 0.4055746719241142, "learning_rate": 7.999577557712995e-06, "loss": -0.0647, "step": 570, "step_time": 13.544377095997334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6301787197589874, "epoch": 0.01142, "frac_reward_zero_std": 0.0, "grad_norm": 0.1370149701833725, "kl": 0.18997220695018768, "learning_rate": 7.99957597408801e-06, "loss": -0.0705, "num_tokens": 13806090.0, "reward": 0.5074659585952759, "reward_std": 0.9238597750663757, "rewards/rollout_reward_func/mean": 0.5074659585952759, "rewards/rollout_reward_func/std": 0.923859715461731, "sampling/importance_sampling_ratio/max": 1.5140273571014404, "sampling/importance_sampling_ratio/mean": 0.9367969036102295, "sampling/importance_sampling_ratio/min": 7.140769184843521e-07, "sampling/sampling_logp_difference/max": 1.9890875816345215, "sampling/sampling_logp_difference/mean": 0.3037923276424408, "step": 571, "step_time": 22.05148322301102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6235196888446808, "epoch": 0.01144, "grad_norm": 0.13696607947349548, "kl": 0.1883544884622097, "learning_rate": 7.99957438750049e-06, "loss": -0.0711, "step": 572, "step_time": 11.236106574011501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.675094097852707, "epoch": 0.01146, "frac_reward_zero_std": 0.125, "grad_norm": 0.11821992695331573, "kl": 0.30158702097833157, "learning_rate": 7.999572797950433e-06, "loss": -0.0783, "num_tokens": 13855578.0, "reward": 0.344024121761322, "reward_std": 0.8792535662651062, "rewards/rollout_reward_func/mean": 0.344024121761322, "rewards/rollout_reward_func/std": 0.8792535066604614, "sampling/importance_sampling_ratio/max": 1.5427340269088745, "sampling/importance_sampling_ratio/mean": 0.826412558555603, "sampling/importance_sampling_ratio/min": 1.1055685718019959e-06, "sampling/sampling_logp_difference/max": 1.8621290922164917, "sampling/sampling_logp_difference/mean": 0.3297950327396393, "step": 573, "step_time": 22.45773957305937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.671057567000389, "epoch": 0.01148, "grad_norm": 0.11368881165981293, "kl": 0.3097478002309799, "learning_rate": 7.999571205437842e-06, "loss": -0.0788, "step": 574, "step_time": 12.183750328986207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.477375864982605, "epoch": 0.0115, "frac_reward_zero_std": 0.125, "grad_norm": 0.019131500273942947, "kl": 0.28612951561808586, "learning_rate": 7.999569609962722e-06, "loss": -0.068, "num_tokens": 13903409.0, "reward": 0.824332058429718, "reward_std": 0.9524894952774048, "rewards/rollout_reward_func/mean": 0.824332058429718, "rewards/rollout_reward_func/std": 0.95248943567276, "sampling/importance_sampling_ratio/max": 1.2599400281906128, "sampling/importance_sampling_ratio/mean": 0.8402230143547058, "sampling/importance_sampling_ratio/min": 0.00010585982818156481, "sampling/sampling_logp_difference/max": 1.7387828826904297, "sampling/sampling_logp_difference/mean": 0.25386306643486023, "step": 575, "step_time": 22.80500822304748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0025510203558951616, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025510203558951616, "entropy": 1.4699143022298813, "epoch": 0.01152, "grad_norm": 0.022033391520380974, "kl": 0.30670520663261414, "learning_rate": 7.99956801152507e-06, "loss": -0.068, "step": 576, "step_time": 12.056431384989992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7026359997689724, "epoch": 0.01154, "frac_reward_zero_std": 0.375, "grad_norm": 0.07146520167589188, "kl": 0.31668270006775856, "learning_rate": 7.999566410124892e-06, "loss": -0.013, "num_tokens": 13952799.0, "reward": 0.8122657537460327, "reward_std": 0.7041395902633667, "rewards/rollout_reward_func/mean": 0.8122657537460327, "rewards/rollout_reward_func/std": 0.7041395306587219, "sampling/importance_sampling_ratio/max": 1.3126319646835327, "sampling/importance_sampling_ratio/mean": 1.051222801208496, "sampling/importance_sampling_ratio/min": 6.403106453944929e-07, "sampling/sampling_logp_difference/max": 1.982934594154358, "sampling/sampling_logp_difference/mean": 0.1518601030111313, "step": 577, "step_time": 22.353788392007118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6995865181088448, "epoch": 0.01156, "grad_norm": 0.07573340833187103, "kl": 0.32360298186540604, "learning_rate": 7.999564805762185e-06, "loss": -0.0129, "step": 578, "step_time": 12.000010584975826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1688006222248077, "epoch": 0.01158, "frac_reward_zero_std": 0.25, "grad_norm": 0.09912919253110886, "kl": 0.5850773379206657, "learning_rate": 7.999563198436954e-06, "loss": -0.0241, "num_tokens": 14003921.0, "reward": 0.6237558126449585, "reward_std": 0.8234612941741943, "rewards/rollout_reward_func/mean": 0.6237558126449585, "rewards/rollout_reward_func/std": 0.8234612345695496, "sampling/importance_sampling_ratio/max": 1.4258944988250732, "sampling/importance_sampling_ratio/mean": 0.9118479490280151, "sampling/importance_sampling_ratio/min": 4.355646888143383e-05, "sampling/sampling_logp_difference/max": 2.141261577606201, "sampling/sampling_logp_difference/mean": 0.21840789914131165, "step": 579, "step_time": 24.14886470203055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1748896092176437, "epoch": 0.0116, "grad_norm": 0.09905301034450531, "kl": 0.555860135704279, "learning_rate": 7.9995615881492e-06, "loss": -0.0246, "step": 580, "step_time": 12.532211746962275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3328650780022144, "epoch": 0.01162, "frac_reward_zero_std": 0.375, "grad_norm": 0.040565382689237595, "kl": 0.5672213360667229, "learning_rate": 7.999559974898922e-06, "loss": -0.0634, "num_tokens": 14048310.0, "reward": 0.7558819651603699, "reward_std": 0.9551746249198914, "rewards/rollout_reward_func/mean": 0.7558819651603699, "rewards/rollout_reward_func/std": 0.9551746249198914, "sampling/importance_sampling_ratio/max": 1.2678321599960327, "sampling/importance_sampling_ratio/mean": 0.8132999539375305, "sampling/importance_sampling_ratio/min": 0.000653735944069922, "sampling/sampling_logp_difference/max": 1.5924959182739258, "sampling/sampling_logp_difference/mean": 0.2222401648759842, "step": 581, "step_time": 22.719742513028905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.3360439985990524, "epoch": 0.01164, "grad_norm": 0.04027239978313446, "kl": 0.5365498587489128, "learning_rate": 7.999558358686124e-06, "loss": -0.0634, "step": 582, "step_time": 11.169494655012386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6601975411176682, "epoch": 0.01166, "frac_reward_zero_std": 0.125, "grad_norm": 0.08173540234565735, "kl": 0.639050554484129, "learning_rate": 7.999556739510809e-06, "loss": -0.0777, "num_tokens": 14102178.0, "reward": 0.4743726849555969, "reward_std": 0.8650224208831787, "rewards/rollout_reward_func/mean": 0.4743726849555969, "rewards/rollout_reward_func/std": 0.8650224208831787, "sampling/importance_sampling_ratio/max": 1.3190529346466064, "sampling/importance_sampling_ratio/mean": 0.7146161794662476, "sampling/importance_sampling_ratio/min": 6.688180292258039e-06, "sampling/sampling_logp_difference/max": 2.127532720565796, "sampling/sampling_logp_difference/mean": 0.31683140993118286, "step": 583, "step_time": 25.122296049026772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6589747071266174, "epoch": 0.01168, "grad_norm": 0.08013390749692917, "kl": 0.6184039954096079, "learning_rate": 7.999555117372973e-06, "loss": -0.0777, "step": 584, "step_time": 12.6066497110005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 4.269230842590332, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.138944536447525, "epoch": 0.0117, "frac_reward_zero_std": 0.25, "grad_norm": 0.19565936923027039, "kl": 0.34418684616684914, "learning_rate": 7.999553492272625e-06, "loss": -0.068, "num_tokens": 14158744.0, "reward": 0.7029767036437988, "reward_std": 0.739852786064148, "rewards/rollout_reward_func/mean": 0.7029767036437988, "rewards/rollout_reward_func/std": 0.739852786064148, "sampling/importance_sampling_ratio/max": 1.945508360862732, "sampling/importance_sampling_ratio/mean": 0.9174230694770813, "sampling/importance_sampling_ratio/min": 1.157485392255797e-11, "sampling/sampling_logp_difference/max": 2.8382930755615234, "sampling/sampling_logp_difference/mean": 0.46633821725845337, "step": 585, "step_time": 29.291979837056715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.131661146879196, "epoch": 0.01172, "grad_norm": 0.18819089233875275, "kl": 0.36533910036087036, "learning_rate": 7.999551864209762e-06, "loss": -0.0686, "step": 586, "step_time": 15.483512364997296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1632440835237503, "epoch": 0.01174, "frac_reward_zero_std": 0.5, "grad_norm": 0.04080057889223099, "kl": 0.3717390298843384, "learning_rate": 7.999550233184386e-06, "loss": -0.0424, "num_tokens": 14203998.0, "reward": 0.763816773891449, "reward_std": 0.7996335625648499, "rewards/rollout_reward_func/mean": 0.763816773891449, "rewards/rollout_reward_func/std": 0.7996335625648499, "sampling/importance_sampling_ratio/max": 1.5186705589294434, "sampling/importance_sampling_ratio/mean": 0.9298934936523438, "sampling/importance_sampling_ratio/min": 4.86092810447758e-09, "sampling/sampling_logp_difference/max": 2.014730453491211, "sampling/sampling_logp_difference/mean": 0.265707790851593, "step": 587, "step_time": 20.82536325097317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.155618742108345, "epoch": 0.01176, "grad_norm": 0.06275554746389389, "kl": 0.38610897213220596, "learning_rate": 7.999548599196499e-06, "loss": -0.0424, "step": 588, "step_time": 11.022864889004268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4823988676071167, "epoch": 0.01178, "frac_reward_zero_std": 0.5, "grad_norm": 0.08104754984378815, "kl": 0.40209636837244034, "learning_rate": 7.999546962246104e-06, "loss": -0.0149, "num_tokens": 14245398.0, "reward": 0.6241686344146729, "reward_std": 0.9703801274299622, "rewards/rollout_reward_func/mean": 0.6241686344146729, "rewards/rollout_reward_func/std": 0.9703801274299622, "sampling/importance_sampling_ratio/max": 1.252401351928711, "sampling/importance_sampling_ratio/mean": 0.8640539646148682, "sampling/importance_sampling_ratio/min": 2.4130156361934496e-06, "sampling/sampling_logp_difference/max": 2.265015125274658, "sampling/sampling_logp_difference/mean": 0.23261785507202148, "step": 589, "step_time": 21.95452932099579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4799501597881317, "epoch": 0.0118, "grad_norm": 0.07903237640857697, "kl": 0.39687687158584595, "learning_rate": 7.9995453223332e-06, "loss": -0.015, "step": 590, "step_time": 11.393426812050166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.075499214231968, "epoch": 0.01182, "frac_reward_zero_std": 0.25, "grad_norm": 0.024038415402173996, "kl": 0.3165910020470619, "learning_rate": 7.999543679457792e-06, "loss": -0.0366, "num_tokens": 14294922.0, "reward": 0.7353512048721313, "reward_std": 0.8189472556114197, "rewards/rollout_reward_func/mean": 0.7353512048721313, "rewards/rollout_reward_func/std": 0.8189472556114197, "sampling/importance_sampling_ratio/max": 1.3302741050720215, "sampling/importance_sampling_ratio/mean": 0.9953497648239136, "sampling/importance_sampling_ratio/min": 4.958101840202289e-07, "sampling/sampling_logp_difference/max": 2.2397568225860596, "sampling/sampling_logp_difference/mean": 0.21743053197860718, "step": 591, "step_time": 19.54176142497454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0731277242302895, "epoch": 0.01184, "grad_norm": 0.0222147386521101, "kl": 0.322433415800333, "learning_rate": 7.999542033619878e-06, "loss": -0.0365, "step": 592, "step_time": 10.107914963009534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0701725706458092, "epoch": 0.01186, "frac_reward_zero_std": 0.25, "grad_norm": 0.049580253660678864, "kl": 0.33866169303655624, "learning_rate": 7.999540384819463e-06, "loss": -0.048, "num_tokens": 14347502.0, "reward": 0.47284501791000366, "reward_std": 0.8309923410415649, "rewards/rollout_reward_func/mean": 0.47284501791000366, "rewards/rollout_reward_func/std": 0.8309923410415649, "sampling/importance_sampling_ratio/max": 1.4151946306228638, "sampling/importance_sampling_ratio/mean": 0.8670730590820312, "sampling/importance_sampling_ratio/min": 8.432402864855248e-06, "sampling/sampling_logp_difference/max": 1.9565328359603882, "sampling/sampling_logp_difference/mean": 0.20470115542411804, "step": 593, "step_time": 26.68386750103673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0687872245907784, "epoch": 0.01188, "grad_norm": 0.05126381665468216, "kl": 0.324587170034647, "learning_rate": 7.999538733056544e-06, "loss": -0.0479, "step": 594, "step_time": 13.781329125049524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.5806450843811035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.017708383500576, "epoch": 0.0119, "frac_reward_zero_std": 0.5, "grad_norm": 0.06606928259134293, "kl": 0.2623043581843376, "learning_rate": 7.999537078331127e-06, "loss": -0.0374, "num_tokens": 14393429.0, "reward": 0.14529317617416382, "reward_std": 0.7340584993362427, "rewards/rollout_reward_func/mean": 0.14529317617416382, "rewards/rollout_reward_func/std": 0.7340584993362427, "sampling/importance_sampling_ratio/max": 1.327750563621521, "sampling/importance_sampling_ratio/mean": 0.9944205284118652, "sampling/importance_sampling_ratio/min": 2.904102257161867e-05, "sampling/sampling_logp_difference/max": 1.697263240814209, "sampling/sampling_logp_difference/mean": 0.19034233689308167, "step": 595, "step_time": 19.02147234196309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0152403861284256, "epoch": 0.01192, "grad_norm": 0.07024326175451279, "kl": 0.26086222380399704, "learning_rate": 7.999535420643213e-06, "loss": -0.0373, "step": 596, "step_time": 10.056697990046814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2953735888004303, "epoch": 0.01194, "frac_reward_zero_std": 0.375, "grad_norm": 0.07604225724935532, "kl": 0.824144147336483, "learning_rate": 7.999533759992803e-06, "loss": -0.0428, "num_tokens": 14437710.0, "reward": 0.6449147462844849, "reward_std": 0.7881911993026733, "rewards/rollout_reward_func/mean": 0.6449147462844849, "rewards/rollout_reward_func/std": 0.7881911993026733, "sampling/importance_sampling_ratio/max": 1.3467950820922852, "sampling/importance_sampling_ratio/mean": 0.8685688376426697, "sampling/importance_sampling_ratio/min": 9.955807763617486e-05, "sampling/sampling_logp_difference/max": 1.896111249923706, "sampling/sampling_logp_difference/mean": 0.2589989900588989, "step": 597, "step_time": 22.582027941010892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2968421280384064, "epoch": 0.01196, "grad_norm": 0.06993883848190308, "kl": 0.790953166782856, "learning_rate": 7.999532096379897e-06, "loss": -0.043, "step": 598, "step_time": 12.150841051043244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3871600925922394, "epoch": 0.01198, "frac_reward_zero_std": 0.125, "grad_norm": 0.061958979815244675, "kl": 0.31631014123559, "learning_rate": 7.999530429804498e-06, "loss": -0.0567, "num_tokens": 14492635.0, "reward": 0.6524503231048584, "reward_std": 0.9171168208122253, "rewards/rollout_reward_func/mean": 0.6524503231048584, "rewards/rollout_reward_func/std": 0.9171168208122253, "sampling/importance_sampling_ratio/max": 1.282955527305603, "sampling/importance_sampling_ratio/mean": 0.8603212833404541, "sampling/importance_sampling_ratio/min": 0.00021159845346119255, "sampling/sampling_logp_difference/max": 1.676994800567627, "sampling/sampling_logp_difference/mean": 0.21247506141662598, "step": 599, "step_time": 25.156937316962285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3926382660865784, "epoch": 0.012, "grad_norm": 0.061991363763809204, "kl": 0.31591612100601196, "learning_rate": 7.999528760266607e-06, "loss": -0.0566, "step": 600, "step_time": 12.475890830013668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.407407283782959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5060130059719086, "epoch": 0.01202, "frac_reward_zero_std": 0.125, "grad_norm": 0.06245148181915283, "kl": 0.3201846703886986, "learning_rate": 7.99952708776623e-06, "loss": -0.0601, "num_tokens": 14545442.0, "reward": 0.5878309011459351, "reward_std": 0.9402400851249695, "rewards/rollout_reward_func/mean": 0.5878309011459351, "rewards/rollout_reward_func/std": 0.9402400255203247, "sampling/importance_sampling_ratio/max": 1.1919763088226318, "sampling/importance_sampling_ratio/mean": 0.8056904077529907, "sampling/importance_sampling_ratio/min": 0.0008888222509995103, "sampling/sampling_logp_difference/max": 1.8376964330673218, "sampling/sampling_logp_difference/mean": 0.22870692610740662, "step": 601, "step_time": 22.978086381976027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5068408250808716, "epoch": 0.01204, "grad_norm": 0.0658877044916153, "kl": 0.30196306854486465, "learning_rate": 7.999525412303364e-06, "loss": -0.0603, "step": 602, "step_time": 12.100167483004043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.576923370361328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9631384313106537, "epoch": 0.01206, "frac_reward_zero_std": 0.0, "grad_norm": 0.0836118757724762, "kl": 0.3539353348314762, "learning_rate": 7.99952373387801e-06, "loss": -0.0733, "num_tokens": 14607213.0, "reward": 0.2533046007156372, "reward_std": 0.8035767078399658, "rewards/rollout_reward_func/mean": 0.2533046007156372, "rewards/rollout_reward_func/std": 0.8035767078399658, "sampling/importance_sampling_ratio/max": 1.3630921840667725, "sampling/importance_sampling_ratio/mean": 0.7510389685630798, "sampling/importance_sampling_ratio/min": 1.2936141047248384e-07, "sampling/sampling_logp_difference/max": 2.336544990539551, "sampling/sampling_logp_difference/mean": 0.3318808674812317, "step": 603, "step_time": 30.286871721036732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9716594815254211, "epoch": 0.01208, "grad_norm": 0.08828490972518921, "kl": 0.3379824236035347, "learning_rate": 7.999522052490171e-06, "loss": -0.0737, "step": 604, "step_time": 14.938330515986308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.250682920217514, "epoch": 0.0121, "frac_reward_zero_std": 0.125, "grad_norm": 0.06290841102600098, "kl": 0.4687911421060562, "learning_rate": 7.99952036813985e-06, "loss": -0.0783, "num_tokens": 14650730.0, "reward": 0.5479092597961426, "reward_std": 0.8254054188728333, "rewards/rollout_reward_func/mean": 0.5479092597961426, "rewards/rollout_reward_func/std": 0.8254054188728333, "sampling/importance_sampling_ratio/max": 1.7206127643585205, "sampling/importance_sampling_ratio/mean": 0.8471285104751587, "sampling/importance_sampling_ratio/min": 1.3933323543824372e-06, "sampling/sampling_logp_difference/max": 1.9442006349563599, "sampling/sampling_logp_difference/mean": 0.22302642464637756, "step": 605, "step_time": 27.721177784987958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.265324130654335, "epoch": 0.01212, "grad_norm": 0.0714358538389206, "kl": 0.43373803794384, "learning_rate": 7.999518680827047e-06, "loss": -0.0782, "step": 606, "step_time": 14.644599271006882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.129719614982605, "epoch": 0.01214, "frac_reward_zero_std": 0.0, "grad_norm": 0.037050746381282806, "kl": 0.2622334733605385, "learning_rate": 7.999516990551766e-06, "loss": -0.0528, "num_tokens": 14706898.0, "reward": 0.511856734752655, "reward_std": 0.8929914236068726, "rewards/rollout_reward_func/mean": 0.511856734752655, "rewards/rollout_reward_func/std": 0.8929914236068726, "sampling/importance_sampling_ratio/max": 1.2035194635391235, "sampling/importance_sampling_ratio/mean": 0.7392221689224243, "sampling/importance_sampling_ratio/min": 1.0905677072514663e-06, "sampling/sampling_logp_difference/max": 2.10268497467041, "sampling/sampling_logp_difference/mean": 0.33420097827911377, "step": 607, "step_time": 27.21217205602443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0037878789007663727, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037878789007663727, "entropy": 2.140593409538269, "epoch": 0.01216, "grad_norm": 0.035110216587781906, "kl": 0.2696530967950821, "learning_rate": 7.999515297314007e-06, "loss": -0.0527, "step": 608, "step_time": 14.064514678990236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3941213563084602, "epoch": 0.01218, "frac_reward_zero_std": 0.125, "grad_norm": 0.072840616106987, "kl": 0.18253653682768345, "learning_rate": 7.99951360111377e-06, "loss": -0.0742, "num_tokens": 14761075.0, "reward": 0.26172691583633423, "reward_std": 0.9224830865859985, "rewards/rollout_reward_func/mean": 0.26172691583633423, "rewards/rollout_reward_func/std": 0.9224830865859985, "sampling/importance_sampling_ratio/max": 1.1873962879180908, "sampling/importance_sampling_ratio/mean": 0.7908101081848145, "sampling/importance_sampling_ratio/min": 0.00029854141757823527, "sampling/sampling_logp_difference/max": 1.3710784912109375, "sampling/sampling_logp_difference/mean": 0.23211708664894104, "step": 609, "step_time": 29.657406139973318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3951872996985912, "epoch": 0.0122, "grad_norm": 0.06747329980134964, "kl": 0.18493002839386463, "learning_rate": 7.999511901951059e-06, "loss": -0.0744, "step": 610, "step_time": 14.353908781020436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9672441482543945, "epoch": 0.01222, "frac_reward_zero_std": 0.0, "grad_norm": 0.11055363714694977, "kl": 0.3082951419055462, "learning_rate": 7.999510199825875e-06, "loss": -0.078, "num_tokens": 14818826.0, "reward": 0.20148822665214539, "reward_std": 0.8432486057281494, "rewards/rollout_reward_func/mean": 0.20148822665214539, "rewards/rollout_reward_func/std": 0.8432485461235046, "sampling/importance_sampling_ratio/max": 1.532283902168274, "sampling/importance_sampling_ratio/mean": 0.6737741827964783, "sampling/importance_sampling_ratio/min": 0.0007137706852518022, "sampling/sampling_logp_difference/max": 2.050096035003662, "sampling/sampling_logp_difference/mean": 0.27446454763412476, "step": 611, "step_time": 29.60790379001992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.9722878634929657, "epoch": 0.01224, "grad_norm": 0.11620081961154938, "kl": 0.30770764127373695, "learning_rate": 7.99950849473822e-06, "loss": -0.0781, "step": 612, "step_time": 14.269356661010534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7030817121267319, "epoch": 0.01226, "frac_reward_zero_std": 0.0, "grad_norm": 0.1843535155057907, "kl": 0.8762261159718037, "learning_rate": 7.999506786688096e-06, "loss": -0.073, "num_tokens": 14874609.0, "reward": 0.45644983649253845, "reward_std": 0.8672616481781006, "rewards/rollout_reward_func/mean": 0.45644983649253845, "rewards/rollout_reward_func/std": 0.8672616481781006, "sampling/importance_sampling_ratio/max": 1.370222806930542, "sampling/importance_sampling_ratio/mean": 0.7124098539352417, "sampling/importance_sampling_ratio/min": 4.90916931994434e-07, "sampling/sampling_logp_difference/max": 2.164224147796631, "sampling/sampling_logp_difference/mean": 0.3820439875125885, "step": 613, "step_time": 27.32956457103137 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 1.7067149877548218, "epoch": 0.01228, "grad_norm": 0.15180832147598267, "kl": 0.8016882315278053, "learning_rate": 7.999505075675503e-06, "loss": -0.0738, "step": 614, "step_time": 12.972835782013135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.65625, "completions/mean_terminated_length": 5.319999694824219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3075550496578217, "epoch": 0.0123, "frac_reward_zero_std": 0.0, "grad_norm": 0.1003868505358696, "kl": 0.7925223037600517, "learning_rate": 7.999503361700445e-06, "loss": -0.0649, "num_tokens": 14926132.0, "reward": 0.13634400069713593, "reward_std": 0.8147754073143005, "rewards/rollout_reward_func/mean": 0.13634400069713593, "rewards/rollout_reward_func/std": 0.8147754073143005, "sampling/importance_sampling_ratio/max": 1.1955710649490356, "sampling/importance_sampling_ratio/mean": 0.5460975170135498, "sampling/importance_sampling_ratio/min": 6.779416708013741e-07, "sampling/sampling_logp_difference/max": 1.9635593891143799, "sampling/sampling_logp_difference/mean": 0.3691813349723816, "step": 615, "step_time": 27.784645949985133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3147368133068085, "epoch": 0.01232, "grad_norm": 0.08476968109607697, "kl": 0.649327352643013, "learning_rate": 7.99950164476292e-06, "loss": -0.0654, "step": 616, "step_time": 12.501808521978091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 8.28125, "completions/mean_terminated_length": 4.238095283508301, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0130199939012527, "epoch": 0.01234, "frac_reward_zero_std": 0.0, "grad_norm": 0.10376418381929398, "kl": 0.4527351185679436, "learning_rate": 7.999499924862935e-06, "loss": -0.0683, "num_tokens": 14986495.0, "reward": 0.30166733264923096, "reward_std": 0.914475679397583, "rewards/rollout_reward_func/mean": 0.30166733264923096, "rewards/rollout_reward_func/std": 0.914475679397583, "sampling/importance_sampling_ratio/max": 1.667893648147583, "sampling/importance_sampling_ratio/mean": 0.6620603203773499, "sampling/importance_sampling_ratio/min": 2.804072494200227e-07, "sampling/sampling_logp_difference/max": 1.9334560632705688, "sampling/sampling_logp_difference/mean": 0.34498029947280884, "step": 617, "step_time": 29.828703286970267 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 2.0267962217330933, "epoch": 0.01236, "grad_norm": 0.08519843965768814, "kl": 0.40770816802978516, "learning_rate": 7.999498202000486e-06, "loss": -0.0689, "step": 618, "step_time": 14.351106911053648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2944580912590027, "epoch": 0.01238, "frac_reward_zero_std": 0.125, "grad_norm": 0.04580199345946312, "kl": 0.4832926318049431, "learning_rate": 7.999496476175581e-06, "loss": -0.0526, "num_tokens": 15036770.0, "reward": 0.6684806942939758, "reward_std": 0.870858371257782, "rewards/rollout_reward_func/mean": 0.6684806942939758, "rewards/rollout_reward_func/std": 0.870858371257782, "sampling/importance_sampling_ratio/max": 1.253013014793396, "sampling/importance_sampling_ratio/mean": 0.8462755680084229, "sampling/importance_sampling_ratio/min": 2.3319789761444554e-05, "sampling/sampling_logp_difference/max": 1.8476550579071045, "sampling/sampling_logp_difference/mean": 0.21199588477611542, "step": 619, "step_time": 24.646833659004187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.302478939294815, "epoch": 0.0124, "grad_norm": 0.03114585019648075, "kl": 0.4598294720053673, "learning_rate": 7.999494747388215e-06, "loss": -0.0527, "step": 620, "step_time": 12.17701236900757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 4.639999866485596, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2025553584098816, "epoch": 0.01242, "frac_reward_zero_std": 0.0, "grad_norm": 0.160451278090477, "kl": 0.7084700092673302, "learning_rate": 7.999493015638396e-06, "loss": -0.0714, "num_tokens": 15092514.0, "reward": 0.4234066903591156, "reward_std": 0.8921055197715759, "rewards/rollout_reward_func/mean": 0.4234066903591156, "rewards/rollout_reward_func/std": 0.8921054601669312, "sampling/importance_sampling_ratio/max": 1.6306078433990479, "sampling/importance_sampling_ratio/mean": 0.6423841714859009, "sampling/importance_sampling_ratio/min": 5.281063977236045e-07, "sampling/sampling_logp_difference/max": 2.164001226425171, "sampling/sampling_logp_difference/mean": 0.40099334716796875, "step": 621, "step_time": 28.225071749999188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.2103733122348785, "epoch": 0.01244, "grad_norm": 0.14162279665470123, "kl": 0.6151915192604065, "learning_rate": 7.999491280926121e-06, "loss": -0.0718, "step": 622, "step_time": 14.452784896013327 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 5.551723957061768, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7271623611450195, "epoch": 0.01246, "frac_reward_zero_std": 0.25, "grad_norm": 0.08730985224246979, "kl": 0.42554599046707153, "learning_rate": 7.999489543251393e-06, "loss": -0.051, "num_tokens": 15142575.0, "reward": 0.40300121903419495, "reward_std": 0.8358184695243835, "rewards/rollout_reward_func/mean": 0.40300121903419495, "rewards/rollout_reward_func/std": 0.8358184695243835, "sampling/importance_sampling_ratio/max": 1.4563953876495361, "sampling/importance_sampling_ratio/mean": 0.7912628054618835, "sampling/importance_sampling_ratio/min": 1.7880276459436573e-07, "sampling/sampling_logp_difference/max": 1.9691001176834106, "sampling/sampling_logp_difference/mean": 0.33013319969177246, "step": 623, "step_time": 23.172432368039154 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.732508935034275, "epoch": 0.01248, "grad_norm": 0.0874401405453682, "kl": 0.4194940961897373, "learning_rate": 7.999487802614216e-06, "loss": -0.051, "step": 624, "step_time": 11.856422121025389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 4.4166669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.037731871008873, "epoch": 0.0125, "frac_reward_zero_std": 0.25, "grad_norm": 0.08612816780805588, "kl": 0.3828126806765795, "learning_rate": 7.999486059014588e-06, "loss": -0.0839, "num_tokens": 15197976.0, "reward": 0.3146930932998657, "reward_std": 0.9470716714859009, "rewards/rollout_reward_func/mean": 0.3146930932998657, "rewards/rollout_reward_func/std": 0.9470716714859009, "sampling/importance_sampling_ratio/max": 1.44766366481781, "sampling/importance_sampling_ratio/mean": 0.6534232497215271, "sampling/importance_sampling_ratio/min": 3.3695148886181414e-05, "sampling/sampling_logp_difference/max": 1.876952052116394, "sampling/sampling_logp_difference/mean": 0.31924036145210266, "step": 625, "step_time": 28.109615505003603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.042749732732773, "epoch": 0.01252, "grad_norm": 0.08482646942138672, "kl": 0.39780546724796295, "learning_rate": 7.999484312452514e-06, "loss": -0.0839, "step": 626, "step_time": 12.607224239007337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.1875, "completions/mean_terminated_length": 6.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5666000843048096, "epoch": 0.01254, "frac_reward_zero_std": 0.125, "grad_norm": 0.13058826327323914, "kl": 0.17694589495658875, "learning_rate": 7.999482562927993e-06, "loss": -0.0736, "num_tokens": 15252813.0, "reward": 0.34561920166015625, "reward_std": 0.9532734751701355, "rewards/rollout_reward_func/mean": 0.34561920166015625, "rewards/rollout_reward_func/std": 0.9532734751701355, "sampling/importance_sampling_ratio/max": 1.354502558708191, "sampling/importance_sampling_ratio/mean": 0.5847890377044678, "sampling/importance_sampling_ratio/min": 8.668273920875436e-08, "sampling/sampling_logp_difference/max": 1.8076255321502686, "sampling/sampling_logp_difference/mean": 0.37899577617645264, "step": 627, "step_time": 33.85142021201318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.567971110343933, "epoch": 0.01256, "grad_norm": 0.12159623205661774, "kl": 0.18016375973820686, "learning_rate": 7.999480810441028e-06, "loss": -0.074, "step": 628, "step_time": 17.71778120897943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.625, "completions/mean_terminated_length": 5.692307949066162, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3226044476032257, "epoch": 0.01258, "frac_reward_zero_std": 0.0, "grad_norm": 0.12922322750091553, "kl": 0.7662292215973139, "learning_rate": 7.999479054991623e-06, "loss": -0.048, "num_tokens": 15304894.0, "reward": 0.24025380611419678, "reward_std": 0.8485355973243713, "rewards/rollout_reward_func/mean": 0.24025380611419678, "rewards/rollout_reward_func/std": 0.8485355973243713, "sampling/importance_sampling_ratio/max": 1.4813023805618286, "sampling/importance_sampling_ratio/mean": 0.540473222732544, "sampling/importance_sampling_ratio/min": 1.0426097105664667e-06, "sampling/sampling_logp_difference/max": 2.0235021114349365, "sampling/sampling_logp_difference/mean": 0.40892964601516724, "step": 629, "step_time": 28.13611749597476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.314581334590912, "epoch": 0.0126, "grad_norm": 0.11718084663152695, "kl": 0.7777796201407909, "learning_rate": 7.999477296579775e-06, "loss": -0.0485, "step": 630, "step_time": 12.593492091022199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1648819744586945, "epoch": 0.01262, "frac_reward_zero_std": 0.125, "grad_norm": 0.2021998018026352, "kl": 0.27665494196116924, "learning_rate": 7.99947553520549e-06, "loss": -0.0748, "num_tokens": 15367098.0, "reward": 0.27793192863464355, "reward_std": 0.9560502171516418, "rewards/rollout_reward_func/mean": 0.27793192863464355, "rewards/rollout_reward_func/std": 0.9560502171516418, "sampling/importance_sampling_ratio/max": 2.0328567028045654, "sampling/importance_sampling_ratio/mean": 0.6106616258621216, "sampling/importance_sampling_ratio/min": 4.9779984578890435e-08, "sampling/sampling_logp_difference/max": 1.6750733852386475, "sampling/sampling_logp_difference/mean": 0.3524184823036194, "step": 631, "step_time": 30.279499767988455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 2.1461209654808044, "epoch": 0.01264, "grad_norm": 0.20502367615699768, "kl": 0.2985377851873636, "learning_rate": 7.999473770868766e-06, "loss": -0.0766, "step": 632, "step_time": 14.698416001017904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 5.032258033752441, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2514598965644836, "epoch": 0.01266, "frac_reward_zero_std": 0.25, "grad_norm": 0.03610309213399887, "kl": 0.4158504791557789, "learning_rate": 7.999472003569607e-06, "loss": -0.0421, "num_tokens": 15418288.0, "reward": 0.9318621158599854, "reward_std": 0.6771441102027893, "rewards/rollout_reward_func/mean": 0.9318621158599854, "rewards/rollout_reward_func/std": 0.6771440505981445, "sampling/importance_sampling_ratio/max": 1.3182486295700073, "sampling/importance_sampling_ratio/mean": 0.8970247507095337, "sampling/importance_sampling_ratio/min": 0.00030590686947107315, "sampling/sampling_logp_difference/max": 1.7683987617492676, "sampling/sampling_logp_difference/mean": 0.21538791060447693, "step": 633, "step_time": 24.01461747000576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2380684912204742, "epoch": 0.01268, "grad_norm": 0.03373964875936508, "kl": 0.41307035833597183, "learning_rate": 7.999470233308015e-06, "loss": -0.0421, "step": 634, "step_time": 12.35459088702919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.666666507720947, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5973729193210602, "epoch": 0.0127, "frac_reward_zero_std": 0.25, "grad_norm": 0.057777367532253265, "kl": 0.8368481546640396, "learning_rate": 7.999468460083992e-06, "loss": -0.0533, "num_tokens": 15475048.0, "reward": 0.49123725295066833, "reward_std": 0.8145251870155334, "rewards/rollout_reward_func/mean": 0.49123725295066833, "rewards/rollout_reward_func/std": 0.8145251274108887, "sampling/importance_sampling_ratio/max": 1.2077100276947021, "sampling/importance_sampling_ratio/mean": 0.7120595574378967, "sampling/importance_sampling_ratio/min": 7.491424912586808e-05, "sampling/sampling_logp_difference/max": 2.0707767009735107, "sampling/sampling_logp_difference/mean": 0.26118677854537964, "step": 635, "step_time": 25.95458351695561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5883207619190216, "epoch": 0.01272, "grad_norm": 0.054175540804862976, "kl": 0.8095605373382568, "learning_rate": 7.999466683897538e-06, "loss": -0.0536, "step": 636, "step_time": 12.996607854991453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.379310131072998, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2080968245863914, "epoch": 0.01274, "frac_reward_zero_std": 0.25, "grad_norm": 0.07392999529838562, "kl": 0.42572132870554924, "learning_rate": 7.999464904748656e-06, "loss": -0.0571, "num_tokens": 15521285.0, "reward": 0.8663508892059326, "reward_std": 0.7898703217506409, "rewards/rollout_reward_func/mean": 0.8663508892059326, "rewards/rollout_reward_func/std": 0.7898702621459961, "sampling/importance_sampling_ratio/max": 1.7037684917449951, "sampling/importance_sampling_ratio/mean": 0.8864027261734009, "sampling/importance_sampling_ratio/min": 5.161385843166499e-07, "sampling/sampling_logp_difference/max": 2.025653123855591, "sampling/sampling_logp_difference/mean": 0.25934940576553345, "step": 637, "step_time": 23.192564277007477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.202439859509468, "epoch": 0.01276, "grad_norm": 0.07131126523017883, "kl": 0.43459997698664665, "learning_rate": 7.999463122637347e-06, "loss": -0.057, "step": 638, "step_time": 11.25478688898147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.806451320648193, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9281025007367134, "epoch": 0.01278, "frac_reward_zero_std": 0.25, "grad_norm": 0.09558572620153427, "kl": 0.5031001418828964, "learning_rate": 7.999461337563614e-06, "loss": -0.0475, "num_tokens": 15576111.0, "reward": 1.0190860033035278, "reward_std": 0.6348373293876648, "rewards/rollout_reward_func/mean": 1.0190860033035278, "rewards/rollout_reward_func/std": 0.6348373293876648, "sampling/importance_sampling_ratio/max": 1.124040961265564, "sampling/importance_sampling_ratio/mean": 0.8541474342346191, "sampling/importance_sampling_ratio/min": 2.2909125618753023e-05, "sampling/sampling_logp_difference/max": 1.7988003492355347, "sampling/sampling_logp_difference/mean": 0.19595205783843994, "step": 639, "step_time": 23.031778046977706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9175268337130547, "epoch": 0.0128, "grad_norm": 0.09482042491436005, "kl": 0.5147890746593475, "learning_rate": 7.999459549527458e-06, "loss": -0.0476, "step": 640, "step_time": 12.272379611968063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.035714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1515282951295376, "epoch": 0.01282, "frac_reward_zero_std": 0.25, "grad_norm": 0.0755336731672287, "kl": 0.3220389820635319, "learning_rate": 7.99945775852888e-06, "loss": -0.0445, "num_tokens": 15625877.0, "reward": 0.6679611802101135, "reward_std": 0.8527466654777527, "rewards/rollout_reward_func/mean": 0.6679611802101135, "rewards/rollout_reward_func/std": 0.8527466654777527, "sampling/importance_sampling_ratio/max": 1.5548999309539795, "sampling/importance_sampling_ratio/mean": 0.8904337882995605, "sampling/importance_sampling_ratio/min": 6.212003881955752e-06, "sampling/sampling_logp_difference/max": 2.0125346183776855, "sampling/sampling_logp_difference/mean": 0.24270027875900269, "step": 641, "step_time": 24.297673223016318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.150095395743847, "epoch": 0.01284, "grad_norm": 0.07768074423074722, "kl": 0.3212229013442993, "learning_rate": 7.999455964567883e-06, "loss": -0.0448, "step": 642, "step_time": 12.037888229009695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.44813871942460537, "epoch": 0.01286, "frac_reward_zero_std": 0.5, "grad_norm": 0.019436843693256378, "kl": 0.28273091092705727, "learning_rate": 7.999454167644469e-06, "loss": -0.0226, "num_tokens": 15678198.0, "reward": 0.8278160095214844, "reward_std": 0.7212541103363037, "rewards/rollout_reward_func/mean": 0.8278160095214844, "rewards/rollout_reward_func/std": 0.7212541103363037, "sampling/importance_sampling_ratio/max": 1.1237376928329468, "sampling/importance_sampling_ratio/mean": 1.0123308897018433, "sampling/importance_sampling_ratio/min": 3.407842086744495e-05, "sampling/sampling_logp_difference/max": 1.470442533493042, "sampling/sampling_logp_difference/mean": 0.0962212011218071, "step": 643, "step_time": 19.28825918896473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.443360498175025, "epoch": 0.01288, "grad_norm": 0.01761561632156372, "kl": 0.28620560467243195, "learning_rate": 7.999452367758637e-06, "loss": -0.0227, "step": 644, "step_time": 10.053988870000467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2220691591501236, "epoch": 0.0129, "frac_reward_zero_std": 0.0, "grad_norm": 0.23931573331356049, "kl": 0.5346290674060583, "learning_rate": 7.999450564910393e-06, "loss": -0.0516, "num_tokens": 15737379.0, "reward": 0.6647263169288635, "reward_std": 0.7990368008613586, "rewards/rollout_reward_func/mean": 0.6647263169288635, "rewards/rollout_reward_func/std": 0.7990367412567139, "sampling/importance_sampling_ratio/max": 2.265681505203247, "sampling/importance_sampling_ratio/mean": 0.8282722234725952, "sampling/importance_sampling_ratio/min": 0.0003051871317438781, "sampling/sampling_logp_difference/max": 1.7769005298614502, "sampling/sampling_logp_difference/mean": 0.2351367175579071, "step": 645, "step_time": 23.91128883996862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2142047807574272, "epoch": 0.01292, "grad_norm": 0.21465331315994263, "kl": 0.5578869245946407, "learning_rate": 7.999448759099734e-06, "loss": -0.0527, "step": 646, "step_time": 12.405981192016043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.387096405029297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7848729938268661, "epoch": 0.01294, "frac_reward_zero_std": 0.25, "grad_norm": 0.055218081921339035, "kl": 0.5814726799726486, "learning_rate": 7.999446950326668e-06, "loss": -0.0536, "num_tokens": 15783806.0, "reward": 0.9098113775253296, "reward_std": 0.7709308862686157, "rewards/rollout_reward_func/mean": 0.9098113775253296, "rewards/rollout_reward_func/std": 0.770930826663971, "sampling/importance_sampling_ratio/max": 1.2504587173461914, "sampling/importance_sampling_ratio/mean": 0.8689265251159668, "sampling/importance_sampling_ratio/min": 0.006566135212779045, "sampling/sampling_logp_difference/max": 1.7202134132385254, "sampling/sampling_logp_difference/mean": 0.1518675982952118, "step": 647, "step_time": 19.91672217301675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 0.7930992171168327, "epoch": 0.01296, "grad_norm": 0.05225754156708717, "kl": 0.6035823747515678, "learning_rate": 7.999445138591192e-06, "loss": -0.0538, "step": 648, "step_time": 10.724829480983317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8986048474907875, "epoch": 0.01298, "frac_reward_zero_std": 0.125, "grad_norm": 0.0704539567232132, "kl": 0.4151359125971794, "learning_rate": 7.999443323893308e-06, "loss": -0.0477, "num_tokens": 15842460.0, "reward": 0.44394516944885254, "reward_std": 0.7965427041053772, "rewards/rollout_reward_func/mean": 0.44394516944885254, "rewards/rollout_reward_func/std": 0.7965427041053772, "sampling/importance_sampling_ratio/max": 1.6586425304412842, "sampling/importance_sampling_ratio/mean": 0.8825705647468567, "sampling/importance_sampling_ratio/min": 1.6350779787899228e-06, "sampling/sampling_logp_difference/max": 1.9065879583358765, "sampling/sampling_logp_difference/mean": 0.21223823726177216, "step": 649, "step_time": 26.61622931898455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9030275717377663, "epoch": 0.013, "grad_norm": 0.07289323955774307, "kl": 0.4409713186323643, "learning_rate": 7.99944150623302e-06, "loss": -0.048, "step": 650, "step_time": 15.091048383008456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8794125039130449, "epoch": 0.01302, "frac_reward_zero_std": 0.375, "grad_norm": 0.09737718850374222, "kl": 0.4607542008161545, "learning_rate": 7.999439685610326e-06, "loss": -0.0291, "num_tokens": 15898327.0, "reward": 0.8050496578216553, "reward_std": 0.7395010590553284, "rewards/rollout_reward_func/mean": 0.8050496578216553, "rewards/rollout_reward_func/std": 0.7395009994506836, "sampling/importance_sampling_ratio/max": 1.189726710319519, "sampling/importance_sampling_ratio/mean": 0.9166607856750488, "sampling/importance_sampling_ratio/min": 1.656099897218155e-07, "sampling/sampling_logp_difference/max": 1.730009913444519, "sampling/sampling_logp_difference/mean": 0.19711247086524963, "step": 651, "step_time": 31.280367694038432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.895474087446928, "epoch": 0.01304, "grad_norm": 0.08937954902648926, "kl": 0.47213052958250046, "learning_rate": 7.999437862025232e-06, "loss": -0.0293, "step": 652, "step_time": 17.656289748993004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.8275861740112305, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1915734484791756, "epoch": 0.01306, "frac_reward_zero_std": 0.25, "grad_norm": 0.04800831153988838, "kl": 0.4493953175842762, "learning_rate": 7.999436035477738e-06, "loss": -0.0401, "num_tokens": 15951410.0, "reward": 0.9730976819992065, "reward_std": 0.6625683307647705, "rewards/rollout_reward_func/mean": 0.9730976819992065, "rewards/rollout_reward_func/std": 0.6625682711601257, "sampling/importance_sampling_ratio/max": 1.1137903928756714, "sampling/importance_sampling_ratio/mean": 0.8604781627655029, "sampling/importance_sampling_ratio/min": 1.5646627105070365e-07, "sampling/sampling_logp_difference/max": 2.222379446029663, "sampling/sampling_logp_difference/mean": 0.24564622342586517, "step": 653, "step_time": 22.967592642031377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1968053113669157, "epoch": 0.01308, "grad_norm": 0.060616735368967056, "kl": 0.43232959508895874, "learning_rate": 7.999434205967846e-06, "loss": -0.0404, "step": 654, "step_time": 12.055220306036063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.967741966247559, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2099148938432336, "epoch": 0.0131, "frac_reward_zero_std": 0.125, "grad_norm": 0.0964663028717041, "kl": 0.4707539454102516, "learning_rate": 7.999432373495559e-06, "loss": -0.0361, "num_tokens": 16003886.0, "reward": 0.3838632106781006, "reward_std": 0.7347189784049988, "rewards/rollout_reward_func/mean": 0.3838632106781006, "rewards/rollout_reward_func/std": 0.7347189784049988, "sampling/importance_sampling_ratio/max": 1.402877926826477, "sampling/importance_sampling_ratio/mean": 0.7896133661270142, "sampling/importance_sampling_ratio/min": 0.0004272454243618995, "sampling/sampling_logp_difference/max": 1.8740670680999756, "sampling/sampling_logp_difference/mean": 0.2292020618915558, "step": 655, "step_time": 22.907603664993076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013247282709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013247282709926367, "entropy": 1.2311033112928271, "epoch": 0.01312, "grad_norm": 0.09990929067134857, "kl": 0.4534991756081581, "learning_rate": 7.999430538060875e-06, "loss": -0.0365, "step": 656, "step_time": 12.325210421025986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2428657859563828, "epoch": 0.01314, "frac_reward_zero_std": 0.0, "grad_norm": 0.07634939253330231, "kl": 0.7205159291625023, "learning_rate": 7.999428699663802e-06, "loss": -0.0422, "num_tokens": 16067113.0, "reward": 0.5217165946960449, "reward_std": 0.706849217414856, "rewards/rollout_reward_func/mean": 0.5217165946960449, "rewards/rollout_reward_func/std": 0.7068492770195007, "sampling/importance_sampling_ratio/max": 1.7177469730377197, "sampling/importance_sampling_ratio/mean": 0.8235355615615845, "sampling/importance_sampling_ratio/min": 0.001564962905831635, "sampling/sampling_logp_difference/max": 1.6770710945129395, "sampling/sampling_logp_difference/mean": 0.22117188572883606, "step": 657, "step_time": 27.613374374981504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 1.2720893025398254, "epoch": 0.01316, "grad_norm": 0.06569357961416245, "kl": 0.6534224972128868, "learning_rate": 7.999426858304336e-06, "loss": -0.043, "step": 658, "step_time": 15.046496717986884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6950805932283401, "epoch": 0.01318, "frac_reward_zero_std": 0.125, "grad_norm": 0.12808696925640106, "kl": 0.5315034985542297, "learning_rate": 7.99942501398248e-06, "loss": -0.0554, "num_tokens": 16119712.0, "reward": 0.4651361405849457, "reward_std": 0.8445784449577332, "rewards/rollout_reward_func/mean": 0.4651361405849457, "rewards/rollout_reward_func/std": 0.8445783853530884, "sampling/importance_sampling_ratio/max": 1.5278619527816772, "sampling/importance_sampling_ratio/mean": 0.6760427355766296, "sampling/importance_sampling_ratio/min": 3.040245388774565e-08, "sampling/sampling_logp_difference/max": 2.287198543548584, "sampling/sampling_logp_difference/mean": 0.3436537981033325, "step": 659, "step_time": 24.362799990019994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010937500046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.7134375721216202, "epoch": 0.0132, "grad_norm": 0.09320656210184097, "kl": 0.49372249096632004, "learning_rate": 7.999423166698238e-06, "loss": -0.0556, "step": 660, "step_time": 12.787170721945586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.3125, "completions/mean_terminated_length": 5.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3518589437007904, "epoch": 0.01322, "frac_reward_zero_std": 0.125, "grad_norm": 0.13314208388328552, "kl": 0.6289276704192162, "learning_rate": 7.999421316451612e-06, "loss": -0.0621, "num_tokens": 16175753.0, "reward": 0.566811203956604, "reward_std": 0.8669345378875732, "rewards/rollout_reward_func/mean": 0.566811203956604, "rewards/rollout_reward_func/std": 0.8669345378875732, "sampling/importance_sampling_ratio/max": 1.2156310081481934, "sampling/importance_sampling_ratio/mean": 0.5333943367004395, "sampling/importance_sampling_ratio/min": 1.9322831576573662e-05, "sampling/sampling_logp_difference/max": 2.1104536056518555, "sampling/sampling_logp_difference/mean": 0.38163280487060547, "step": 661, "step_time": 29.35364477400435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.356599271297455, "epoch": 0.01324, "grad_norm": 0.12718363106250763, "kl": 0.5876485630869865, "learning_rate": 7.9994194632426e-06, "loss": -0.0621, "step": 662, "step_time": 13.983055356016848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 4.703703880310059, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2614212036132812, "epoch": 0.01326, "frac_reward_zero_std": 0.375, "grad_norm": 0.05980461835861206, "kl": 0.4357745163142681, "learning_rate": 7.999417607071208e-06, "loss": -0.03, "num_tokens": 16230671.0, "reward": 0.8009196519851685, "reward_std": 0.9755838513374329, "rewards/rollout_reward_func/mean": 0.8009196519851685, "rewards/rollout_reward_func/std": 0.9755837917327881, "sampling/importance_sampling_ratio/max": 1.103499174118042, "sampling/importance_sampling_ratio/mean": 0.6982583403587341, "sampling/importance_sampling_ratio/min": 5.694036008208059e-05, "sampling/sampling_logp_difference/max": 1.8693811893463135, "sampling/sampling_logp_difference/mean": 0.23386327922344208, "step": 663, "step_time": 29.268705052003497 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.267343819141388, "epoch": 0.01328, "grad_norm": 0.03521811589598656, "kl": 0.40291281789541245, "learning_rate": 7.999415747937436e-06, "loss": -0.0301, "step": 664, "step_time": 14.251812504953705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.888888835906982, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0027058869600296, "epoch": 0.0133, "frac_reward_zero_std": 0.0, "grad_norm": 0.06046625226736069, "kl": 0.3768591806292534, "learning_rate": 7.999413885841285e-06, "loss": -0.0687, "num_tokens": 16296364.0, "reward": 0.31955528259277344, "reward_std": 0.8170309066772461, "rewards/rollout_reward_func/mean": 0.31955528259277344, "rewards/rollout_reward_func/std": 0.8170309066772461, "sampling/importance_sampling_ratio/max": 1.5773407220840454, "sampling/importance_sampling_ratio/mean": 0.7280118465423584, "sampling/importance_sampling_ratio/min": 9.037933068611892e-09, "sampling/sampling_logp_difference/max": 2.2600460052490234, "sampling/sampling_logp_difference/mean": 0.38173627853393555, "step": 665, "step_time": 29.90589888204704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.003906711935997, "epoch": 0.01332, "grad_norm": 0.06156401336193085, "kl": 0.36431392282247543, "learning_rate": 7.99941202078276e-06, "loss": -0.0687, "step": 666, "step_time": 14.888614549010526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 5.290322303771973, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2722155451774597, "epoch": 0.01334, "frac_reward_zero_std": 0.125, "grad_norm": 0.10454296320676804, "kl": 0.4884123057126999, "learning_rate": 7.999410152761859e-06, "loss": -0.0355, "num_tokens": 16351244.0, "reward": 0.7748913168907166, "reward_std": 0.7170933485031128, "rewards/rollout_reward_func/mean": 0.7748913168907166, "rewards/rollout_reward_func/std": 0.7170933485031128, "sampling/importance_sampling_ratio/max": 1.3271433115005493, "sampling/importance_sampling_ratio/mean": 0.7333921194076538, "sampling/importance_sampling_ratio/min": 0.00037038285518065095, "sampling/sampling_logp_difference/max": 1.8970824480056763, "sampling/sampling_logp_difference/mean": 0.2246384471654892, "step": 667, "step_time": 22.88444274602807 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 1.2703353762626648, "epoch": 0.01336, "grad_norm": 0.09138640016317368, "kl": 0.4401082843542099, "learning_rate": 7.999408281778585e-06, "loss": -0.0357, "step": 668, "step_time": 12.027084295026725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0746544003486633, "epoch": 0.01338, "frac_reward_zero_std": 0.375, "grad_norm": 0.07566967606544495, "kl": 0.16412027180194855, "learning_rate": 7.999406407832942e-06, "loss": -0.0418, "num_tokens": 16403035.0, "reward": 0.5396990776062012, "reward_std": 0.9460533261299133, "rewards/rollout_reward_func/mean": 0.5396990776062012, "rewards/rollout_reward_func/std": 0.9460533261299133, "sampling/importance_sampling_ratio/max": 1.1151859760284424, "sampling/importance_sampling_ratio/mean": 0.6864380240440369, "sampling/importance_sampling_ratio/min": 8.563441511455494e-09, "sampling/sampling_logp_difference/max": 2.1947855949401855, "sampling/sampling_logp_difference/mean": 0.3205423057079315, "step": 669, "step_time": 25.59264149199589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0723381638526917, "epoch": 0.0134, "grad_norm": 0.07582882791757584, "kl": 0.16365410946309566, "learning_rate": 7.99940453092493e-06, "loss": -0.0417, "step": 670, "step_time": 12.336247468017973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.11916683614254, "epoch": 0.01342, "frac_reward_zero_std": 0.0, "grad_norm": 0.10172202438116074, "kl": 0.40727490186691284, "learning_rate": 7.99940265105455e-06, "loss": -0.0435, "num_tokens": 16462982.0, "reward": 0.8674511909484863, "reward_std": 0.7666372656822205, "rewards/rollout_reward_func/mean": 0.8674511909484863, "rewards/rollout_reward_func/std": 0.7666372656822205, "sampling/importance_sampling_ratio/max": 1.4046941995620728, "sampling/importance_sampling_ratio/mean": 0.9383034706115723, "sampling/importance_sampling_ratio/min": 5.181162487133406e-05, "sampling/sampling_logp_difference/max": 1.359367847442627, "sampling/sampling_logp_difference/mean": 0.17986464500427246, "step": 671, "step_time": 24.612830526020844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1239020973443985, "epoch": 0.01344, "grad_norm": 0.10139390081167221, "kl": 0.39589958637952805, "learning_rate": 7.999400768221805e-06, "loss": -0.0437, "step": 672, "step_time": 12.200871197972447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 4.814815044403076, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.757671445608139, "epoch": 0.01346, "frac_reward_zero_std": 0.0, "grad_norm": 0.054689280688762665, "kl": 0.23290357738733292, "learning_rate": 7.999398882426698e-06, "loss": -0.0645, "num_tokens": 16521691.0, "reward": 0.5797893404960632, "reward_std": 0.8455713987350464, "rewards/rollout_reward_func/mean": 0.5797893404960632, "rewards/rollout_reward_func/std": 0.8455714583396912, "sampling/importance_sampling_ratio/max": 1.6713835000991821, "sampling/importance_sampling_ratio/mean": 0.7637728452682495, "sampling/importance_sampling_ratio/min": 2.243442395410966e-05, "sampling/sampling_logp_difference/max": 1.7426645755767822, "sampling/sampling_logp_difference/mean": 0.28566277027130127, "step": 673, "step_time": 29.276516335987253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7638892233371735, "epoch": 0.01348, "grad_norm": 0.05074802786111832, "kl": 0.2286323755979538, "learning_rate": 7.999396993669228e-06, "loss": -0.0646, "step": 674, "step_time": 14.359877321025124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.774193286895752, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2080763429403305, "epoch": 0.0135, "frac_reward_zero_std": 0.0, "grad_norm": 0.12284175306558609, "kl": 0.5513945519924164, "learning_rate": 7.9993951019494e-06, "loss": -0.0626, "num_tokens": 16576673.0, "reward": 0.7893291711807251, "reward_std": 0.710241973400116, "rewards/rollout_reward_func/mean": 0.7893291711807251, "rewards/rollout_reward_func/std": 0.7102419137954712, "sampling/importance_sampling_ratio/max": 1.2637943029403687, "sampling/importance_sampling_ratio/mean": 0.7028065919876099, "sampling/importance_sampling_ratio/min": 8.90728915692307e-05, "sampling/sampling_logp_difference/max": 1.9698823690414429, "sampling/sampling_logp_difference/mean": 0.2281273603439331, "step": 675, "step_time": 25.100739449000685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2088459730148315, "epoch": 0.01352, "grad_norm": 0.1169600635766983, "kl": 0.5349870398640633, "learning_rate": 7.999393207267215e-06, "loss": -0.0629, "step": 676, "step_time": 13.09910846495768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2401264607906342, "epoch": 0.01354, "frac_reward_zero_std": 0.0, "grad_norm": 0.1743566393852234, "kl": 0.24738755822181702, "learning_rate": 7.999391309622672e-06, "loss": -0.0214, "num_tokens": 16636562.0, "reward": 0.692796528339386, "reward_std": 0.7625296711921692, "rewards/rollout_reward_func/mean": 0.692796528339386, "rewards/rollout_reward_func/std": 0.7625296711921692, "sampling/importance_sampling_ratio/max": 1.2186381816864014, "sampling/importance_sampling_ratio/mean": 0.8099945187568665, "sampling/importance_sampling_ratio/min": 0.00023029375006444752, "sampling/sampling_logp_difference/max": 1.7980624437332153, "sampling/sampling_logp_difference/mean": 0.20910051465034485, "step": 677, "step_time": 27.097970200993586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.250289462506771, "epoch": 0.01356, "grad_norm": 0.17811301350593567, "kl": 0.2494223602116108, "learning_rate": 7.999389409015776e-06, "loss": -0.0215, "step": 678, "step_time": 14.347705453023082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.076635003089905, "epoch": 0.01358, "frac_reward_zero_std": 0.0, "grad_norm": 0.0977613553404808, "kl": 0.3036822974681854, "learning_rate": 7.99938750544653e-06, "loss": -0.0781, "num_tokens": 16681477.0, "reward": 0.274975061416626, "reward_std": 0.9397128820419312, "rewards/rollout_reward_func/mean": 0.274975061416626, "rewards/rollout_reward_func/std": 0.9397128224372864, "sampling/importance_sampling_ratio/max": 1.123954176902771, "sampling/importance_sampling_ratio/mean": 0.5591959357261658, "sampling/importance_sampling_ratio/min": 5.1302140491316095e-05, "sampling/sampling_logp_difference/max": 1.9663746356964111, "sampling/sampling_logp_difference/mean": 0.3541061580181122, "step": 679, "step_time": 21.0930260369787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.079708009958267, "epoch": 0.0136, "grad_norm": 0.11748799681663513, "kl": 0.28568436205387115, "learning_rate": 7.999385598914932e-06, "loss": -0.0779, "step": 680, "step_time": 10.257939430011902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 5.193548202514648, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5621777176856995, "epoch": 0.01362, "frac_reward_zero_std": 0.125, "grad_norm": 0.06339851766824722, "kl": 0.4140684977173805, "learning_rate": 7.999383689420985e-06, "loss": -0.0416, "num_tokens": 16735786.0, "reward": 0.3055589199066162, "reward_std": 0.6783041954040527, "rewards/rollout_reward_func/mean": 0.3055589199066162, "rewards/rollout_reward_func/std": 0.6783041954040527, "sampling/importance_sampling_ratio/max": 1.1897755861282349, "sampling/importance_sampling_ratio/mean": 0.6858989000320435, "sampling/importance_sampling_ratio/min": 0.00020000559743493795, "sampling/sampling_logp_difference/max": 2.236036539077759, "sampling/sampling_logp_difference/mean": 0.27301323413848877, "step": 681, "step_time": 24.51102351397276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5624827146530151, "epoch": 0.01364, "grad_norm": 0.06711132824420929, "kl": 0.4268855005502701, "learning_rate": 7.999381776964694e-06, "loss": -0.042, "step": 682, "step_time": 13.336265050020302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 5.115384578704834, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8751757442951202, "epoch": 0.01366, "frac_reward_zero_std": 0.125, "grad_norm": 0.16532322764396667, "kl": 0.32901254296302795, "learning_rate": 7.999379861546056e-06, "loss": -0.0733, "num_tokens": 16797529.0, "reward": 0.3510980010032654, "reward_std": 0.9097269773483276, "rewards/rollout_reward_func/mean": 0.3510980010032654, "rewards/rollout_reward_func/std": 0.9097269773483276, "sampling/importance_sampling_ratio/max": 1.5451489686965942, "sampling/importance_sampling_ratio/mean": 0.6941089630126953, "sampling/importance_sampling_ratio/min": 9.906368170220503e-09, "sampling/sampling_logp_difference/max": 3.0908079147338867, "sampling/sampling_logp_difference/mean": 0.33452650904655457, "step": 683, "step_time": 29.065172444010386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8773438930511475, "epoch": 0.01368, "grad_norm": 0.1632479727268219, "kl": 0.3337969481945038, "learning_rate": 7.999377943165078e-06, "loss": -0.0739, "step": 684, "step_time": 13.200431718985783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.733333587646484, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2608595192432404, "epoch": 0.0137, "frac_reward_zero_std": 0.125, "grad_norm": 0.14335913956165314, "kl": 0.24948252737522125, "learning_rate": 7.999376021821759e-06, "loss": -0.0534, "num_tokens": 16843901.0, "reward": 0.6379669904708862, "reward_std": 0.9162090420722961, "rewards/rollout_reward_func/mean": 0.6379669904708862, "rewards/rollout_reward_func/std": 0.9162090420722961, "sampling/importance_sampling_ratio/max": 1.5227077007293701, "sampling/importance_sampling_ratio/mean": 0.7802456021308899, "sampling/importance_sampling_ratio/min": 0.0011118879774585366, "sampling/sampling_logp_difference/max": 1.8695566654205322, "sampling/sampling_logp_difference/mean": 0.19981516897678375, "step": 685, "step_time": 23.361731597979087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2670235931873322, "epoch": 0.01372, "grad_norm": 0.139353409409523, "kl": 0.25737831741571426, "learning_rate": 7.9993740975161e-06, "loss": -0.0542, "step": 686, "step_time": 11.69770633499138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.678571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9915109276771545, "epoch": 0.01374, "frac_reward_zero_std": 0.0, "grad_norm": 0.1506863385438919, "kl": 0.49963168054819107, "learning_rate": 7.999372170248105e-06, "loss": -0.0749, "num_tokens": 16903064.0, "reward": 0.48855718970298767, "reward_std": 0.8251659274101257, "rewards/rollout_reward_func/mean": 0.48855718970298767, "rewards/rollout_reward_func/std": 0.8251659274101257, "sampling/importance_sampling_ratio/max": 1.178667664527893, "sampling/importance_sampling_ratio/mean": 0.6036441922187805, "sampling/importance_sampling_ratio/min": 2.343489171607871e-07, "sampling/sampling_logp_difference/max": 2.311244010925293, "sampling/sampling_logp_difference/mean": 0.42598503828048706, "step": 687, "step_time": 31.61924454200198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0033711791038513, "epoch": 0.01376, "grad_norm": 0.16011428833007812, "kl": 0.5371434390544891, "learning_rate": 7.999370240017773e-06, "loss": -0.0748, "step": 688, "step_time": 14.988474659970962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 4.5217390060424805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.098076492547989, "epoch": 0.01378, "frac_reward_zero_std": 0.0, "grad_norm": 0.1342589557170868, "kl": 0.15811911970376968, "learning_rate": 7.99936830682511e-06, "loss": -0.0757, "num_tokens": 16963038.0, "reward": 0.3904151916503906, "reward_std": 0.9621464014053345, "rewards/rollout_reward_func/mean": 0.3904151916503906, "rewards/rollout_reward_func/std": 0.9621464014053345, "sampling/importance_sampling_ratio/max": 1.3089417219161987, "sampling/importance_sampling_ratio/mean": 0.6827167272567749, "sampling/importance_sampling_ratio/min": 1.2110648128782486e-07, "sampling/sampling_logp_difference/max": 2.302891254425049, "sampling/sampling_logp_difference/mean": 0.32012027502059937, "step": 689, "step_time": 30.289584445999935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.096745878458023, "epoch": 0.0138, "grad_norm": 0.12916982173919678, "kl": 0.1604924611747265, "learning_rate": 7.999366370670116e-06, "loss": -0.0758, "step": 690, "step_time": 14.201757979026297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.33818319439888, "epoch": 0.01382, "frac_reward_zero_std": 0.125, "grad_norm": 0.12510991096496582, "kl": 0.3634220212697983, "learning_rate": 7.999364431552792e-06, "loss": -0.0589, "num_tokens": 17021314.0, "reward": 0.734168529510498, "reward_std": 0.8429166674613953, "rewards/rollout_reward_func/mean": 0.734168529510498, "rewards/rollout_reward_func/std": 0.84291672706604, "sampling/importance_sampling_ratio/max": 1.3802694082260132, "sampling/importance_sampling_ratio/mean": 0.8288866281509399, "sampling/importance_sampling_ratio/min": 7.387807272607461e-06, "sampling/sampling_logp_difference/max": 2.099377393722534, "sampling/sampling_logp_difference/mean": 0.2513716220855713, "step": 691, "step_time": 25.52263240600587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 1.3242818862199783, "epoch": 0.01384, "grad_norm": 0.09645528346300125, "kl": 0.3697939068078995, "learning_rate": 7.99936248947314e-06, "loss": -0.059, "step": 692, "step_time": 12.335857334983302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.6875, "completions/mean_terminated_length": 4.4347825050354, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0137244686484337, "epoch": 0.01386, "frac_reward_zero_std": 0.125, "grad_norm": 0.06007999926805496, "kl": 0.1994903851300478, "learning_rate": 7.999360544431164e-06, "loss": -0.0532, "num_tokens": 17084088.0, "reward": 0.5522299408912659, "reward_std": 0.8768727779388428, "rewards/rollout_reward_func/mean": 0.5522299408912659, "rewards/rollout_reward_func/std": 0.8768727779388428, "sampling/importance_sampling_ratio/max": 1.6458027362823486, "sampling/importance_sampling_ratio/mean": 0.7586766481399536, "sampling/importance_sampling_ratio/min": 6.419782749844671e-08, "sampling/sampling_logp_difference/max": 2.615490436553955, "sampling/sampling_logp_difference/mean": 0.4020835757255554, "step": 693, "step_time": 25.91260730000795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.01778806373477, "epoch": 0.01388, "grad_norm": 0.06127164140343666, "kl": 0.19730256125330925, "learning_rate": 7.999358596426864e-06, "loss": -0.0534, "step": 694, "step_time": 12.686210267012939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1833437979221344, "epoch": 0.0139, "frac_reward_zero_std": 0.125, "grad_norm": 0.11706759035587311, "kl": 0.7678949981927872, "learning_rate": 7.999356645460242e-06, "loss": -0.0855, "num_tokens": 17143029.0, "reward": 0.4215218424797058, "reward_std": 0.8089455366134644, "rewards/rollout_reward_func/mean": 0.4215218424797058, "rewards/rollout_reward_func/std": 0.8089455366134644, "sampling/importance_sampling_ratio/max": 1.5061910152435303, "sampling/importance_sampling_ratio/mean": 0.6473269462585449, "sampling/importance_sampling_ratio/min": 2.442989632811532e-08, "sampling/sampling_logp_difference/max": 2.3715286254882812, "sampling/sampling_logp_difference/mean": 0.3891761898994446, "step": 695, "step_time": 27.57805948096211 }, { "clip_ratio/high_max": 0.027529762126505375, "clip_ratio/high_mean": 0.013764881063252687, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013764881063252687, "entropy": 2.17380353808403, "epoch": 0.01392, "grad_norm": 0.10412117093801498, "kl": 0.8183070570230484, "learning_rate": 7.9993546915313e-06, "loss": -0.0858, "step": 696, "step_time": 13.613843940984225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.413793087005615, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5894187688827515, "epoch": 0.01394, "frac_reward_zero_std": 0.0, "grad_norm": 0.06211242079734802, "kl": 0.41784247756004333, "learning_rate": 7.999352734640043e-06, "loss": -0.0482, "num_tokens": 17207132.0, "reward": 0.559483528137207, "reward_std": 0.7618597149848938, "rewards/rollout_reward_func/mean": 0.559483528137207, "rewards/rollout_reward_func/std": 0.7618597149848938, "sampling/importance_sampling_ratio/max": 1.1933106184005737, "sampling/importance_sampling_ratio/mean": 0.7164065837860107, "sampling/importance_sampling_ratio/min": 8.724703548068646e-06, "sampling/sampling_logp_difference/max": 1.8703352212905884, "sampling/sampling_logp_difference/mean": 0.2799655497074127, "step": 697, "step_time": 29.85219303003396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5800342708826065, "epoch": 0.01396, "grad_norm": 0.06472023576498032, "kl": 0.4076628088951111, "learning_rate": 7.999350774786468e-06, "loss": -0.0483, "step": 698, "step_time": 14.890776774991537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.543226569890976, "epoch": 0.01398, "frac_reward_zero_std": 0.0, "grad_norm": 0.13360345363616943, "kl": 0.33571023121476173, "learning_rate": 7.99934881197058e-06, "loss": -0.0641, "num_tokens": 17268684.0, "reward": 0.32928726077079773, "reward_std": 0.8550252318382263, "rewards/rollout_reward_func/mean": 0.32928726077079773, "rewards/rollout_reward_func/std": 0.8550252318382263, "sampling/importance_sampling_ratio/max": 1.2295457124710083, "sampling/importance_sampling_ratio/mean": 0.6600989103317261, "sampling/importance_sampling_ratio/min": 0.0004380939935799688, "sampling/sampling_logp_difference/max": 1.49567711353302, "sampling/sampling_logp_difference/mean": 0.27587491273880005, "step": 699, "step_time": 27.904873348976253 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.5287295281887054, "epoch": 0.014, "grad_norm": 0.11487120389938354, "kl": 0.32770271599292755, "learning_rate": 7.99934684619238e-06, "loss": -0.0643, "step": 700, "step_time": 13.508862529997714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.568628281354904, "epoch": 0.01402, "frac_reward_zero_std": 0.125, "grad_norm": 0.18934455513954163, "kl": 0.4647462423890829, "learning_rate": 7.999344877451869e-06, "loss": -0.0663, "num_tokens": 17319887.0, "reward": 0.00036950409412384033, "reward_std": 0.8498594164848328, "rewards/rollout_reward_func/mean": 0.00036950409412384033, "rewards/rollout_reward_func/std": 0.8498594164848328, "sampling/importance_sampling_ratio/max": 1.2104798555374146, "sampling/importance_sampling_ratio/mean": 0.48690176010131836, "sampling/importance_sampling_ratio/min": 2.5448405722272582e-05, "sampling/sampling_logp_difference/max": 2.049243927001953, "sampling/sampling_logp_difference/mean": 0.4084184169769287, "step": 701, "step_time": 25.249239497992676 }, { "clip_ratio/high_max": 0.022767857182770967, "clip_ratio/high_mean": 0.011383928591385484, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011383928591385484, "entropy": 2.5633492469787598, "epoch": 0.01404, "grad_norm": 0.1613892912864685, "kl": 0.4399650748819113, "learning_rate": 7.99934290574905e-06, "loss": -0.0669, "step": 702, "step_time": 11.824890291027259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.90625, "completions/mean_terminated_length": 4.862069129943848, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6518320590257645, "epoch": 0.01406, "frac_reward_zero_std": 0.0, "grad_norm": 0.11315791308879852, "kl": 0.4621893912553787, "learning_rate": 7.999340931083928e-06, "loss": -0.0644, "num_tokens": 17377181.0, "reward": 0.6880566477775574, "reward_std": 0.7238656282424927, "rewards/rollout_reward_func/mean": 0.6880566477775574, "rewards/rollout_reward_func/std": 0.7238656282424927, "sampling/importance_sampling_ratio/max": 1.2814185619354248, "sampling/importance_sampling_ratio/mean": 0.8371703624725342, "sampling/importance_sampling_ratio/min": 1.0440881936801816e-08, "sampling/sampling_logp_difference/max": 2.145846366882324, "sampling/sampling_logp_difference/mean": 0.30279844999313354, "step": 703, "step_time": 26.82516143101384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6433797180652618, "epoch": 0.01408, "grad_norm": 0.11198839545249939, "kl": 0.46478037536144257, "learning_rate": 7.999338953456498e-06, "loss": -0.0645, "step": 704, "step_time": 14.353119907988003 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4224929809570312, "epoch": 0.0141, "frac_reward_zero_std": 0.375, "grad_norm": 0.03458398953080177, "kl": 0.471550814807415, "learning_rate": 7.99933697286677e-06, "loss": -0.0315, "num_tokens": 17427789.0, "reward": 0.7644264698028564, "reward_std": 0.7222436666488647, "rewards/rollout_reward_func/mean": 0.7644264698028564, "rewards/rollout_reward_func/std": 0.7222436666488647, "sampling/importance_sampling_ratio/max": 1.1718958616256714, "sampling/importance_sampling_ratio/mean": 0.8303184509277344, "sampling/importance_sampling_ratio/min": 3.208966772860755e-11, "sampling/sampling_logp_difference/max": 2.155276298522949, "sampling/sampling_logp_difference/mean": 0.26329663395881653, "step": 705, "step_time": 24.389433061965974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4285636842250824, "epoch": 0.01412, "grad_norm": 0.08836735039949417, "kl": 0.4649359732866287, "learning_rate": 7.999334989314738e-06, "loss": -0.0313, "step": 706, "step_time": 12.557983080012491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1799347400665283, "epoch": 0.01414, "frac_reward_zero_std": 0.125, "grad_norm": 0.09565088152885437, "kl": 0.708005741238594, "learning_rate": 7.999333002800408e-06, "loss": -0.0703, "num_tokens": 17490057.0, "reward": 0.15050841867923737, "reward_std": 0.8528805375099182, "rewards/rollout_reward_func/mean": 0.15050841867923737, "rewards/rollout_reward_func/std": 0.8528804779052734, "sampling/importance_sampling_ratio/max": 1.3188742399215698, "sampling/importance_sampling_ratio/mean": 0.5313182473182678, "sampling/importance_sampling_ratio/min": 4.249790436006151e-06, "sampling/sampling_logp_difference/max": 2.016558885574341, "sampling/sampling_logp_difference/mean": 0.3501579761505127, "step": 707, "step_time": 29.92753154897946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1947686672210693, "epoch": 0.01416, "grad_norm": 0.08989281952381134, "kl": 0.6200457848608494, "learning_rate": 7.999331013323784e-06, "loss": -0.0705, "step": 708, "step_time": 14.540901341038989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.045454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4249810576438904, "epoch": 0.01418, "frac_reward_zero_std": 0.0, "grad_norm": 0.16278958320617676, "kl": 0.383496779948473, "learning_rate": 7.999329020884865e-06, "loss": -0.0802, "num_tokens": 17551279.0, "reward": 0.3709806799888611, "reward_std": 0.9299173355102539, "rewards/rollout_reward_func/mean": 0.3709806799888611, "rewards/rollout_reward_func/std": 0.9299173951148987, "sampling/importance_sampling_ratio/max": 1.2056103944778442, "sampling/importance_sampling_ratio/mean": 0.604554295539856, "sampling/importance_sampling_ratio/min": 2.640611285187333e-07, "sampling/sampling_logp_difference/max": 2.4343953132629395, "sampling/sampling_logp_difference/mean": 0.3500370979309082, "step": 709, "step_time": 29.115403737028828 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0033783784601837397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007054849062114954, "entropy": 2.421648859977722, "epoch": 0.0142, "grad_norm": 0.13948969542980194, "kl": 0.38439582847058773, "learning_rate": 7.999327025483652e-06, "loss": -0.0808, "step": 710, "step_time": 13.564238318998832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.0625, "completions/mean_terminated_length": 4.956521987915039, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.426849365234375, "epoch": 0.01422, "frac_reward_zero_std": 0.125, "grad_norm": 0.1328951120376587, "kl": 0.25745492801070213, "learning_rate": 7.99932502712015e-06, "loss": -0.0579, "num_tokens": 17602729.0, "reward": 0.26201915740966797, "reward_std": 0.8989862203598022, "rewards/rollout_reward_func/mean": 0.26201915740966797, "rewards/rollout_reward_func/std": 0.8989862203598022, "sampling/importance_sampling_ratio/max": 1.3638890981674194, "sampling/importance_sampling_ratio/mean": 0.6362289786338806, "sampling/importance_sampling_ratio/min": 1.1746519135158451e-07, "sampling/sampling_logp_difference/max": 2.232583999633789, "sampling/sampling_logp_difference/mean": 0.3744901120662689, "step": 711, "step_time": 27.157076688017696 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.4057261645793915, "epoch": 0.01424, "grad_norm": 0.0784364715218544, "kl": 0.24359959177672863, "learning_rate": 7.99932302579436e-06, "loss": -0.0587, "step": 712, "step_time": 12.693729032966075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.360000133514404, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7928893566131592, "epoch": 0.01426, "frac_reward_zero_std": 0.25, "grad_norm": 0.13331812620162964, "kl": 0.380131758749485, "learning_rate": 7.999321021506282e-06, "loss": -0.0571, "num_tokens": 17658531.0, "reward": 0.42059361934661865, "reward_std": 0.8437795042991638, "rewards/rollout_reward_func/mean": 0.42059361934661865, "rewards/rollout_reward_func/std": 0.8437795042991638, "sampling/importance_sampling_ratio/max": 1.310134768486023, "sampling/importance_sampling_ratio/mean": 0.7778888940811157, "sampling/importance_sampling_ratio/min": 6.729809456373914e-07, "sampling/sampling_logp_difference/max": 1.9620013236999512, "sampling/sampling_logp_difference/mean": 0.27599912881851196, "step": 713, "step_time": 28.31578534297296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.779323160648346, "epoch": 0.01428, "grad_norm": 0.10126101970672607, "kl": 0.3715585917234421, "learning_rate": 7.999319014255922e-06, "loss": -0.0578, "step": 714, "step_time": 13.955167339008767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.407407283782959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5310346335172653, "epoch": 0.0143, "frac_reward_zero_std": 0.25, "grad_norm": 0.10431809723377228, "kl": 0.4310237616300583, "learning_rate": 7.999317004043278e-06, "loss": -0.0441, "num_tokens": 17706860.0, "reward": 0.5929617881774902, "reward_std": 0.8775584697723389, "rewards/rollout_reward_func/mean": 0.5929617881774902, "rewards/rollout_reward_func/std": 0.8775584697723389, "sampling/importance_sampling_ratio/max": 1.2411744594573975, "sampling/importance_sampling_ratio/mean": 0.8266808986663818, "sampling/importance_sampling_ratio/min": 2.0255441768313176e-09, "sampling/sampling_logp_difference/max": 2.5470547676086426, "sampling/sampling_logp_difference/mean": 0.3210848569869995, "step": 715, "step_time": 25.526217639970127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5243651643395424, "epoch": 0.01432, "grad_norm": 0.10019469261169434, "kl": 0.46498122438788414, "learning_rate": 7.999314990868354e-06, "loss": -0.0439, "step": 716, "step_time": 12.552166604989907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.200000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0157822743058205, "epoch": 0.01434, "frac_reward_zero_std": 0.375, "grad_norm": 0.17052558064460754, "kl": 0.3723825663328171, "learning_rate": 7.999312974731151e-06, "loss": -0.0331, "num_tokens": 17757923.0, "reward": 0.7717713117599487, "reward_std": 0.7466451525688171, "rewards/rollout_reward_func/mean": 0.7717713117599487, "rewards/rollout_reward_func/std": 0.7466450929641724, "sampling/importance_sampling_ratio/max": 1.2575513124465942, "sampling/importance_sampling_ratio/mean": 0.9669553637504578, "sampling/importance_sampling_ratio/min": 1.0294724233972374e-05, "sampling/sampling_logp_difference/max": 1.8206748962402344, "sampling/sampling_logp_difference/mean": 0.19160564243793488, "step": 717, "step_time": 25.386952729983022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.016843929886818, "epoch": 0.01436, "grad_norm": 0.2187725156545639, "kl": 0.3578209616243839, "learning_rate": 7.999310955631672e-06, "loss": -0.0336, "step": 718, "step_time": 13.957600373018067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.448276042938232, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1477535516023636, "epoch": 0.01438, "frac_reward_zero_std": 0.125, "grad_norm": 0.1380959153175354, "kl": 0.286371823400259, "learning_rate": 7.999308933569918e-06, "loss": -0.031, "num_tokens": 17811528.0, "reward": 0.8086410164833069, "reward_std": 0.8029468059539795, "rewards/rollout_reward_func/mean": 0.8086410164833069, "rewards/rollout_reward_func/std": 0.8029466867446899, "sampling/importance_sampling_ratio/max": 1.317273497581482, "sampling/importance_sampling_ratio/mean": 0.9220717549324036, "sampling/importance_sampling_ratio/min": 2.609478713111457e-07, "sampling/sampling_logp_difference/max": 2.284335136413574, "sampling/sampling_logp_difference/mean": 0.25079888105392456, "step": 719, "step_time": 23.456427394005004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1568533852696419, "epoch": 0.0144, "grad_norm": 0.14759577810764313, "kl": 0.28121136128902435, "learning_rate": 7.99930690854589e-06, "loss": -0.0315, "step": 720, "step_time": 12.227263714012224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.507460802793503, "epoch": 0.01442, "frac_reward_zero_std": 0.25, "grad_norm": 0.0833292230963707, "kl": 0.6269728131592274, "learning_rate": 7.999304880559595e-06, "loss": -0.0449, "num_tokens": 17869649.0, "reward": 0.5198869705200195, "reward_std": 0.899296224117279, "rewards/rollout_reward_func/mean": 0.5198869705200195, "rewards/rollout_reward_func/std": 0.8992962837219238, "sampling/importance_sampling_ratio/max": 1.1639411449432373, "sampling/importance_sampling_ratio/mean": 0.6428179144859314, "sampling/importance_sampling_ratio/min": 3.242043744577927e-09, "sampling/sampling_logp_difference/max": 2.1817214488983154, "sampling/sampling_logp_difference/mean": 0.45491886138916016, "step": 721, "step_time": 25.774408463999862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5090632140636444, "epoch": 0.01444, "grad_norm": 0.08036825805902481, "kl": 0.5626260489225388, "learning_rate": 7.99930284961103e-06, "loss": -0.0449, "step": 722, "step_time": 13.108537731983233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8462712466716766, "epoch": 0.01446, "frac_reward_zero_std": 0.0, "grad_norm": 0.10278814285993576, "kl": 0.5963911935687065, "learning_rate": 7.999300815700199e-06, "loss": -0.0648, "num_tokens": 17928988.0, "reward": 0.250205934047699, "reward_std": 0.6882691979408264, "rewards/rollout_reward_func/mean": 0.250205934047699, "rewards/rollout_reward_func/std": 0.6882691383361816, "sampling/importance_sampling_ratio/max": 1.2484499216079712, "sampling/importance_sampling_ratio/mean": 0.800152063369751, "sampling/importance_sampling_ratio/min": 7.210309149741079e-07, "sampling/sampling_logp_difference/max": 1.8153574466705322, "sampling/sampling_logp_difference/mean": 0.3423278033733368, "step": 723, "step_time": 27.566274197015446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8544347882270813, "epoch": 0.01448, "grad_norm": 0.09332986176013947, "kl": 0.5510276593267918, "learning_rate": 7.9992987788271e-06, "loss": -0.0654, "step": 724, "step_time": 14.391434779012343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.84375, "completions/mean_terminated_length": 5.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.022220015525818, "epoch": 0.0145, "frac_reward_zero_std": 0.0, "grad_norm": 0.15625011920928955, "kl": 0.2560197189450264, "learning_rate": 7.999296738991744e-06, "loss": -0.039, "num_tokens": 17981154.0, "reward": 0.21662664413452148, "reward_std": 0.8879714608192444, "rewards/rollout_reward_func/mean": 0.21662664413452148, "rewards/rollout_reward_func/std": 0.8879714012145996, "sampling/importance_sampling_ratio/max": 1.4891878366470337, "sampling/importance_sampling_ratio/mean": 0.7405828237533569, "sampling/importance_sampling_ratio/min": 7.842297833349221e-08, "sampling/sampling_logp_difference/max": 2.0942630767822266, "sampling/sampling_logp_difference/mean": 0.31601911783218384, "step": 725, "step_time": 29.621050568966893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0295528769493103, "epoch": 0.01452, "grad_norm": 0.16116486489772797, "kl": 0.24708661809563637, "learning_rate": 7.999294696194125e-06, "loss": -0.0397, "step": 726, "step_time": 12.984508451016154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 6.2068963050842285, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.192274510860443, "epoch": 0.01454, "frac_reward_zero_std": 0.0, "grad_norm": 0.06155189871788025, "kl": 0.3847679942846298, "learning_rate": 7.999292650434247e-06, "loss": -0.0996, "num_tokens": 18031163.0, "reward": 0.7351219654083252, "reward_std": 0.8448290824890137, "rewards/rollout_reward_func/mean": 0.7351219654083252, "rewards/rollout_reward_func/std": 0.8448291420936584, "sampling/importance_sampling_ratio/max": 1.3216947317123413, "sampling/importance_sampling_ratio/mean": 0.7245052456855774, "sampling/importance_sampling_ratio/min": 4.2938228261846234e-07, "sampling/sampling_logp_difference/max": 2.317894697189331, "sampling/sampling_logp_difference/mean": 0.4061012864112854, "step": 727, "step_time": 24.752439564006636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2028402388095856, "epoch": 0.01456, "grad_norm": 0.060689784586429596, "kl": 0.3803408518433571, "learning_rate": 7.999290601712113e-06, "loss": -0.0995, "step": 728, "step_time": 12.128386871976545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.519999980926514, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.212666153907776, "epoch": 0.01458, "frac_reward_zero_std": 0.125, "grad_norm": 0.11765348166227341, "kl": 1.4058906510472298, "learning_rate": 7.999288550027724e-06, "loss": -0.0752, "num_tokens": 18087217.0, "reward": 0.2162661850452423, "reward_std": 0.8507146835327148, "rewards/rollout_reward_func/mean": 0.2162661850452423, "rewards/rollout_reward_func/std": 0.8507147431373596, "sampling/importance_sampling_ratio/max": 1.307538628578186, "sampling/importance_sampling_ratio/mean": 0.5958893299102783, "sampling/importance_sampling_ratio/min": 3.2140451367013156e-07, "sampling/sampling_logp_difference/max": 2.0819859504699707, "sampling/sampling_logp_difference/mean": 0.42757558822631836, "step": 729, "step_time": 27.640374248003354 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 2.2208016216754913, "epoch": 0.0146, "grad_norm": 0.09527803957462311, "kl": 1.2146642208099365, "learning_rate": 7.999286495381083e-06, "loss": -0.0758, "step": 730, "step_time": 13.304590400948655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.46875, "completions/mean_terminated_length": 5.045454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.117679685354233, "epoch": 0.01462, "frac_reward_zero_std": 0.125, "grad_norm": 0.06999200582504272, "kl": 0.20500214770436287, "learning_rate": 7.999284437772193e-06, "loss": -0.0422, "num_tokens": 18137987.0, "reward": 0.47967302799224854, "reward_std": 1.0173166990280151, "rewards/rollout_reward_func/mean": 0.47967302799224854, "rewards/rollout_reward_func/std": 1.0173165798187256, "sampling/importance_sampling_ratio/max": 1.2055470943450928, "sampling/importance_sampling_ratio/mean": 0.6800398826599121, "sampling/importance_sampling_ratio/min": 2.9790086841785524e-08, "sampling/sampling_logp_difference/max": 2.220269203186035, "sampling/sampling_logp_difference/mean": 0.4138941168785095, "step": 731, "step_time": 30.549048745975597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.117747277021408, "epoch": 0.01464, "grad_norm": 0.06570176780223846, "kl": 0.20196078717708588, "learning_rate": 7.999282377201053e-06, "loss": -0.0424, "step": 732, "step_time": 14.372704008012079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 4.961538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.272334545850754, "epoch": 0.01466, "frac_reward_zero_std": 0.25, "grad_norm": 0.1674337238073349, "kl": 0.2299179919064045, "learning_rate": 7.999280313667668e-06, "loss": -0.0513, "num_tokens": 18188520.0, "reward": 0.6529377102851868, "reward_std": 0.8790546655654907, "rewards/rollout_reward_func/mean": 0.6529377102851868, "rewards/rollout_reward_func/std": 0.8790546655654907, "sampling/importance_sampling_ratio/max": 1.385277271270752, "sampling/importance_sampling_ratio/mean": 0.7715776562690735, "sampling/importance_sampling_ratio/min": 1.5306474976384266e-09, "sampling/sampling_logp_difference/max": 2.6596531867980957, "sampling/sampling_logp_difference/mean": 0.4383412003517151, "step": 733, "step_time": 25.752542200003518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2698566019535065, "epoch": 0.01468, "grad_norm": 0.18279992043972015, "kl": 0.22520620003342628, "learning_rate": 7.999278247172039e-06, "loss": -0.0519, "step": 734, "step_time": 12.789166647999082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1495928280055523, "epoch": 0.0147, "frac_reward_zero_std": 0.25, "grad_norm": 0.09975752234458923, "kl": 0.2813614457845688, "learning_rate": 7.999276177714167e-06, "loss": -0.0197, "num_tokens": 18241880.0, "reward": 0.7233462333679199, "reward_std": 0.8105880618095398, "rewards/rollout_reward_func/mean": 0.7233462333679199, "rewards/rollout_reward_func/std": 0.8105880618095398, "sampling/importance_sampling_ratio/max": 1.2225984334945679, "sampling/importance_sampling_ratio/mean": 0.907227635383606, "sampling/importance_sampling_ratio/min": 2.654602155871544e-07, "sampling/sampling_logp_difference/max": 2.1631855964660645, "sampling/sampling_logp_difference/mean": 0.20547008514404297, "step": 735, "step_time": 27.970555565989343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1581898853182793, "epoch": 0.01472, "grad_norm": 0.10456065833568573, "kl": 0.28390127047896385, "learning_rate": 7.999274105294054e-06, "loss": -0.0201, "step": 736, "step_time": 15.450094216997968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.047014743089676, "epoch": 0.01474, "frac_reward_zero_std": 0.0, "grad_norm": 0.12893915176391602, "kl": 0.31961856223642826, "learning_rate": 7.999272029911705e-06, "loss": -0.0869, "num_tokens": 18296419.0, "reward": 0.38390448689460754, "reward_std": 0.8872663974761963, "rewards/rollout_reward_func/mean": 0.38390448689460754, "rewards/rollout_reward_func/std": 0.8872663974761963, "sampling/importance_sampling_ratio/max": 1.4225919246673584, "sampling/importance_sampling_ratio/mean": 0.7152419090270996, "sampling/importance_sampling_ratio/min": 4.791019455296919e-05, "sampling/sampling_logp_difference/max": 1.8585035800933838, "sampling/sampling_logp_difference/mean": 0.3239664137363434, "step": 737, "step_time": 25.946549560001586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.04714635014534, "epoch": 0.01476, "grad_norm": 0.1312146633863449, "kl": 0.30682381615042686, "learning_rate": 7.999269951567118e-06, "loss": -0.0866, "step": 738, "step_time": 13.587339391990099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0519711673259735, "epoch": 0.01478, "frac_reward_zero_std": 0.125, "grad_norm": 0.13557514548301697, "kl": 0.33694474399089813, "learning_rate": 7.999267870260298e-06, "loss": -0.0602, "num_tokens": 18355183.0, "reward": 0.2807723581790924, "reward_std": 0.8466700911521912, "rewards/rollout_reward_func/mean": 0.2807723581790924, "rewards/rollout_reward_func/std": 0.8466700911521912, "sampling/importance_sampling_ratio/max": 2.1756677627563477, "sampling/importance_sampling_ratio/mean": 0.7141172289848328, "sampling/importance_sampling_ratio/min": 4.338721282692859e-06, "sampling/sampling_logp_difference/max": 1.5904512405395508, "sampling/sampling_logp_difference/mean": 0.3014373779296875, "step": 739, "step_time": 30.497238386014942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.056706577539444, "epoch": 0.0148, "grad_norm": 0.13919110596179962, "kl": 0.35529975965619087, "learning_rate": 7.999265785991244e-06, "loss": -0.0606, "step": 740, "step_time": 14.824085364001803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3995521664619446, "epoch": 0.01482, "frac_reward_zero_std": 0.125, "grad_norm": 0.11583591997623444, "kl": 0.6995282433927059, "learning_rate": 7.999263698759964e-06, "loss": -0.0455, "num_tokens": 18405125.0, "reward": 0.1753007173538208, "reward_std": 0.9015342593193054, "rewards/rollout_reward_func/mean": 0.1753007173538208, "rewards/rollout_reward_func/std": 0.9015341997146606, "sampling/importance_sampling_ratio/max": 1.426374912261963, "sampling/importance_sampling_ratio/mean": 0.8347722291946411, "sampling/importance_sampling_ratio/min": 4.377164714242099e-06, "sampling/sampling_logp_difference/max": 2.1944069862365723, "sampling/sampling_logp_difference/mean": 0.23050710558891296, "step": 741, "step_time": 25.242942068027332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.396638661623001, "epoch": 0.01484, "grad_norm": 0.10957717150449753, "kl": 0.6667241584509611, "learning_rate": 7.999261608566454e-06, "loss": -0.0458, "step": 742, "step_time": 12.940481143043144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2367215156555176, "epoch": 0.01486, "frac_reward_zero_std": 0.0, "grad_norm": 0.052890580147504807, "kl": 0.45344195514917374, "learning_rate": 7.999259515410718e-06, "loss": -0.0528, "num_tokens": 18465524.0, "reward": 0.4255617558956146, "reward_std": 0.8257405161857605, "rewards/rollout_reward_func/mean": 0.4255617558956146, "rewards/rollout_reward_func/std": 0.8257404565811157, "sampling/importance_sampling_ratio/max": 1.4020661115646362, "sampling/importance_sampling_ratio/mean": 0.742242693901062, "sampling/importance_sampling_ratio/min": 9.530915122013539e-05, "sampling/sampling_logp_difference/max": 1.9488604068756104, "sampling/sampling_logp_difference/mean": 0.3469865620136261, "step": 743, "step_time": 27.23138408502564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.239260733127594, "epoch": 0.01488, "grad_norm": 0.0533529669046402, "kl": 0.43749789521098137, "learning_rate": 7.999257419292758e-06, "loss": -0.0529, "step": 744, "step_time": 12.996136585017666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.039134055376053, "epoch": 0.0149, "frac_reward_zero_std": 0.125, "grad_norm": 0.09669248759746552, "kl": 0.4941076636314392, "learning_rate": 7.999255320212578e-06, "loss": -0.0614, "num_tokens": 18524621.0, "reward": 0.46437275409698486, "reward_std": 0.824779212474823, "rewards/rollout_reward_func/mean": 0.46437275409698486, "rewards/rollout_reward_func/std": 0.8247791528701782, "sampling/importance_sampling_ratio/max": 1.5435415506362915, "sampling/importance_sampling_ratio/mean": 0.7781914472579956, "sampling/importance_sampling_ratio/min": 6.248455520108109e-06, "sampling/sampling_logp_difference/max": 2.4601120948791504, "sampling/sampling_logp_difference/mean": 0.3466167747974396, "step": 745, "step_time": 27.892053203016985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.038052558898926, "epoch": 0.01492, "grad_norm": 0.09713956713676453, "kl": 0.4988078251481056, "learning_rate": 7.999253218170178e-06, "loss": -0.0616, "step": 746, "step_time": 13.599348883959465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8928766548633575, "epoch": 0.01494, "frac_reward_zero_std": 0.0, "grad_norm": 0.17764507234096527, "kl": 0.294567471370101, "learning_rate": 7.99925111316556e-06, "loss": -0.0682, "num_tokens": 18581959.0, "reward": 0.3758199214935303, "reward_std": 0.8643825650215149, "rewards/rollout_reward_func/mean": 0.3758199214935303, "rewards/rollout_reward_func/std": 0.8643825054168701, "sampling/importance_sampling_ratio/max": 1.1602455377578735, "sampling/importance_sampling_ratio/mean": 0.6416795253753662, "sampling/importance_sampling_ratio/min": 1.9423002584062488e-07, "sampling/sampling_logp_difference/max": 1.9974576234817505, "sampling/sampling_logp_difference/mean": 0.33954018354415894, "step": 747, "step_time": 28.719597340998007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8886485993862152, "epoch": 0.01496, "grad_norm": 0.1698876917362213, "kl": 0.3158133253455162, "learning_rate": 7.999249005198726e-06, "loss": -0.0685, "step": 748, "step_time": 13.753892866981914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7669428288936615, "epoch": 0.01498, "frac_reward_zero_std": 0.25, "grad_norm": 0.028594229370355606, "kl": 0.6731907539069653, "learning_rate": 7.99924689426968e-06, "loss": -0.0631, "num_tokens": 18631845.0, "reward": 0.6901145577430725, "reward_std": 0.8686864376068115, "rewards/rollout_reward_func/mean": 0.6901145577430725, "rewards/rollout_reward_func/std": 0.8686864376068115, "sampling/importance_sampling_ratio/max": 1.252240538597107, "sampling/importance_sampling_ratio/mean": 0.7813502550125122, "sampling/importance_sampling_ratio/min": 6.773455424990971e-06, "sampling/sampling_logp_difference/max": 1.8636584281921387, "sampling/sampling_logp_difference/mean": 0.2860270142555237, "step": 749, "step_time": 29.997398353996687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7626339197158813, "epoch": 0.015, "grad_norm": 0.02641717903316021, "kl": 0.6500725299119949, "learning_rate": 7.999244780378422e-06, "loss": -0.0632, "step": 750, "step_time": 16.20676132198423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6037218421697617, "epoch": 0.01502, "frac_reward_zero_std": 0.125, "grad_norm": 0.15423473715782166, "kl": 0.545834444463253, "learning_rate": 7.999242663524954e-06, "loss": -0.0755, "num_tokens": 18689575.0, "reward": 0.4989456236362457, "reward_std": 0.8048995137214661, "rewards/rollout_reward_func/mean": 0.4989456236362457, "rewards/rollout_reward_func/std": 0.8048995733261108, "sampling/importance_sampling_ratio/max": 1.3795603513717651, "sampling/importance_sampling_ratio/mean": 0.7897645235061646, "sampling/importance_sampling_ratio/min": 6.619913165195612e-06, "sampling/sampling_logp_difference/max": 1.7956295013427734, "sampling/sampling_logp_difference/mean": 0.26877179741859436, "step": 751, "step_time": 28.277961452986347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.591916710138321, "epoch": 0.01504, "grad_norm": 0.1365826576948166, "kl": 0.571337878704071, "learning_rate": 7.99924054370928e-06, "loss": -0.0759, "step": 752, "step_time": 14.885367073002271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0711532682180405, "epoch": 0.01506, "frac_reward_zero_std": 0.5, "grad_norm": 0.03472145274281502, "kl": 0.6900215223431587, "learning_rate": 7.999238420931402e-06, "loss": -0.0439, "num_tokens": 18738905.0, "reward": 0.6913614273071289, "reward_std": 0.8023927807807922, "rewards/rollout_reward_func/mean": 0.6913614273071289, "rewards/rollout_reward_func/std": 0.8023927211761475, "sampling/importance_sampling_ratio/max": 1.160366415977478, "sampling/importance_sampling_ratio/mean": 0.8394909501075745, "sampling/importance_sampling_ratio/min": 1.6609843441983685e-05, "sampling/sampling_logp_difference/max": 2.26340913772583, "sampling/sampling_logp_difference/mean": 0.22321049869060516, "step": 753, "step_time": 25.582483445003163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0657307505607605, "epoch": 0.01508, "grad_norm": 0.028128065168857574, "kl": 0.6876601874828339, "learning_rate": 7.999236295191319e-06, "loss": -0.044, "step": 754, "step_time": 13.816160204994958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9715510159730911, "epoch": 0.0151, "frac_reward_zero_std": 0.25, "grad_norm": 0.031102925539016724, "kl": 0.32522811740636826, "learning_rate": 7.999234166489036e-06, "loss": -0.0528, "num_tokens": 18788151.0, "reward": 0.7043464779853821, "reward_std": 0.8573060035705566, "rewards/rollout_reward_func/mean": 0.7043464779853821, "rewards/rollout_reward_func/std": 0.8573059439659119, "sampling/importance_sampling_ratio/max": 1.406122088432312, "sampling/importance_sampling_ratio/mean": 0.8775302171707153, "sampling/importance_sampling_ratio/min": 0.001202683080919087, "sampling/sampling_logp_difference/max": 1.6594324111938477, "sampling/sampling_logp_difference/mean": 0.17691701650619507, "step": 755, "step_time": 26.292430812027305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9682820439338684, "epoch": 0.01512, "grad_norm": 0.032323937863111496, "kl": 0.34157153591513634, "learning_rate": 7.999232034824556e-06, "loss": -0.0527, "step": 756, "step_time": 13.173796071961988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6455449163913727, "epoch": 0.01514, "frac_reward_zero_std": 0.125, "grad_norm": 0.06538223475217819, "kl": 0.543352298438549, "learning_rate": 7.999229900197879e-06, "loss": -0.0597, "num_tokens": 18844038.0, "reward": 0.6316707134246826, "reward_std": 0.8370941281318665, "rewards/rollout_reward_func/mean": 0.6316707134246826, "rewards/rollout_reward_func/std": 0.8370940685272217, "sampling/importance_sampling_ratio/max": 1.4709327220916748, "sampling/importance_sampling_ratio/mean": 0.8097854852676392, "sampling/importance_sampling_ratio/min": 9.373871989737381e-08, "sampling/sampling_logp_difference/max": 2.587655544281006, "sampling/sampling_logp_difference/mean": 0.3335694968700409, "step": 757, "step_time": 25.024457627005177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6439728140830994, "epoch": 0.01516, "grad_norm": 0.06320173293352127, "kl": 0.5692086219787598, "learning_rate": 7.999227762609006e-06, "loss": -0.0599, "step": 758, "step_time": 13.044874502986204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 4.559999942779541, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9811437129974365, "epoch": 0.01518, "frac_reward_zero_std": 0.25, "grad_norm": 0.03292161598801613, "kl": 0.25159482657909393, "learning_rate": 7.999225622057942e-06, "loss": -0.0467, "num_tokens": 18897880.0, "reward": 0.7407671213150024, "reward_std": 0.8392125964164734, "rewards/rollout_reward_func/mean": 0.7407671213150024, "rewards/rollout_reward_func/std": 0.8392125964164734, "sampling/importance_sampling_ratio/max": 1.2260981798171997, "sampling/importance_sampling_ratio/mean": 0.7770178318023682, "sampling/importance_sampling_ratio/min": 2.601476722929874e-08, "sampling/sampling_logp_difference/max": 2.304868221282959, "sampling/sampling_logp_difference/mean": 0.3253594934940338, "step": 759, "step_time": 26.489022984023904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9804045855998993, "epoch": 0.0152, "grad_norm": 0.031448956578969955, "kl": 0.2540722843259573, "learning_rate": 7.999223478544686e-06, "loss": -0.0468, "step": 760, "step_time": 13.13521650704206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0687467604875565, "epoch": 0.01522, "frac_reward_zero_std": 0.25, "grad_norm": 0.13268257677555084, "kl": 0.2668837904930115, "learning_rate": 7.999221332069244e-06, "loss": -0.0385, "num_tokens": 18948382.0, "reward": 0.44404709339141846, "reward_std": 0.8262755870819092, "rewards/rollout_reward_func/mean": 0.44404709339141846, "rewards/rollout_reward_func/std": 0.8262755274772644, "sampling/importance_sampling_ratio/max": 1.4450170993804932, "sampling/importance_sampling_ratio/mean": 0.9416602849960327, "sampling/importance_sampling_ratio/min": 1.7362748621962965e-05, "sampling/sampling_logp_difference/max": 1.8689544200897217, "sampling/sampling_logp_difference/mean": 0.16500090062618256, "step": 761, "step_time": 26.447842450987082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.069167748093605, "epoch": 0.01524, "grad_norm": 0.1330908089876175, "kl": 0.2676663063466549, "learning_rate": 7.999219182631615e-06, "loss": -0.0387, "step": 762, "step_time": 13.204618070973083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.740740776062012, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8577495515346527, "epoch": 0.01526, "frac_reward_zero_std": 0.0, "grad_norm": 0.05130615457892418, "kl": 0.7375027313828468, "learning_rate": 7.999217030231803e-06, "loss": -0.078, "num_tokens": 19004565.0, "reward": 0.6310092210769653, "reward_std": 0.769889235496521, "rewards/rollout_reward_func/mean": 0.6310092210769653, "rewards/rollout_reward_func/std": 0.769889235496521, "sampling/importance_sampling_ratio/max": 1.2265764474868774, "sampling/importance_sampling_ratio/mean": 0.7124338746070862, "sampling/importance_sampling_ratio/min": 1.1657341758564144e-07, "sampling/sampling_logp_difference/max": 2.031458854675293, "sampling/sampling_logp_difference/mean": 0.37591132521629333, "step": 763, "step_time": 30.82927067001583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8636550903320312, "epoch": 0.01528, "grad_norm": 0.04853024333715439, "kl": 0.727660559117794, "learning_rate": 7.999214874869809e-06, "loss": -0.0781, "step": 764, "step_time": 15.287873329012655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 5.000000476837158, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2226705104112625, "epoch": 0.0153, "frac_reward_zero_std": 0.125, "grad_norm": 0.06599919497966766, "kl": 0.8245283327996731, "learning_rate": 7.999212716545634e-06, "loss": -0.0369, "num_tokens": 19055062.0, "reward": 0.7713268995285034, "reward_std": 0.7753386497497559, "rewards/rollout_reward_func/mean": 0.7713268995285034, "rewards/rollout_reward_func/std": 0.7753385305404663, "sampling/importance_sampling_ratio/max": 1.3083487749099731, "sampling/importance_sampling_ratio/mean": 0.7962034344673157, "sampling/importance_sampling_ratio/min": 4.835183062823489e-06, "sampling/sampling_logp_difference/max": 1.7658815383911133, "sampling/sampling_logp_difference/mean": 0.25557902455329895, "step": 765, "step_time": 28.07255999694462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.229104906320572, "epoch": 0.01532, "grad_norm": 0.06274139136075974, "kl": 0.7754690200090408, "learning_rate": 7.999210555259283e-06, "loss": -0.0371, "step": 766, "step_time": 14.322086451022187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.34375, "completions/mean_terminated_length": 4.633333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.209895759820938, "epoch": 0.01534, "frac_reward_zero_std": 0.25, "grad_norm": 0.0829334408044815, "kl": 0.6477821208536625, "learning_rate": 7.999208391010757e-06, "loss": -0.0245, "num_tokens": 19112124.0, "reward": 0.5755137205123901, "reward_std": 0.7584482431411743, "rewards/rollout_reward_func/mean": 0.5755137205123901, "rewards/rollout_reward_func/std": 0.7584482431411743, "sampling/importance_sampling_ratio/max": 1.4134849309921265, "sampling/importance_sampling_ratio/mean": 0.914160966873169, "sampling/importance_sampling_ratio/min": 6.917412520124344e-06, "sampling/sampling_logp_difference/max": 1.6922814846038818, "sampling/sampling_logp_difference/mean": 0.22845184803009033, "step": 767, "step_time": 28.463195822987473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2173609733581543, "epoch": 0.01536, "grad_norm": 0.08183560520410538, "kl": 0.6101386547088623, "learning_rate": 7.999206223800056e-06, "loss": -0.0246, "step": 768, "step_time": 15.054429075971711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.793103218078613, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5175889916718006, "epoch": 0.01538, "frac_reward_zero_std": 0.125, "grad_norm": 0.07207804173231125, "kl": 0.24002855271100998, "learning_rate": 7.999204053627186e-06, "loss": -0.0167, "num_tokens": 19170579.0, "reward": 0.794145405292511, "reward_std": 0.8337582349777222, "rewards/rollout_reward_func/mean": 0.794145405292511, "rewards/rollout_reward_func/std": 0.8337582349777222, "sampling/importance_sampling_ratio/max": 1.3248296976089478, "sampling/importance_sampling_ratio/mean": 0.9010328054428101, "sampling/importance_sampling_ratio/min": 1.7767493432074843e-07, "sampling/sampling_logp_difference/max": 2.2687530517578125, "sampling/sampling_logp_difference/mean": 0.3317946195602417, "step": 769, "step_time": 26.091636320983525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5331220496445894, "epoch": 0.0154, "grad_norm": 0.07856160402297974, "kl": 0.23618806153535843, "learning_rate": 7.999201880492145e-06, "loss": -0.0171, "step": 770, "step_time": 12.732732127973577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.7990600783377886, "epoch": 0.01542, "frac_reward_zero_std": 0.5, "grad_norm": 0.0941462516784668, "kl": 0.2843877896666527, "learning_rate": 7.999199704394939e-06, "loss": -0.0235, "num_tokens": 19216308.0, "reward": 1.2848081588745117, "reward_std": 0.5372204780578613, "rewards/rollout_reward_func/mean": 1.2848081588745117, "rewards/rollout_reward_func/std": 0.5372205376625061, "sampling/importance_sampling_ratio/max": 1.8642500638961792, "sampling/importance_sampling_ratio/mean": 1.0076746940612793, "sampling/importance_sampling_ratio/min": 0.0007263892330229282, "sampling/sampling_logp_difference/max": 1.9054100513458252, "sampling/sampling_logp_difference/mean": 0.13585582375526428, "step": 771, "step_time": 26.810201968997717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8139866404235363, "epoch": 0.01544, "grad_norm": 0.10202019661664963, "kl": 0.27822675183415413, "learning_rate": 7.999197525335568e-06, "loss": -0.0241, "step": 772, "step_time": 14.552557688002707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.375, "completions/mean_terminated_length": 4.275862216949463, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.291158214211464, "epoch": 0.01546, "frac_reward_zero_std": 0.25, "grad_norm": 0.042930830270051956, "kl": 0.25535261631011963, "learning_rate": 7.999195343314033e-06, "loss": -0.0465, "num_tokens": 19274716.0, "reward": 0.7905468940734863, "reward_std": 0.7799542546272278, "rewards/rollout_reward_func/mean": 0.7905468940734863, "rewards/rollout_reward_func/std": 0.7799542546272278, "sampling/importance_sampling_ratio/max": 1.2909189462661743, "sampling/importance_sampling_ratio/mean": 0.8929484486579895, "sampling/importance_sampling_ratio/min": 0.0003690074954647571, "sampling/sampling_logp_difference/max": 1.807523250579834, "sampling/sampling_logp_difference/mean": 0.22019247710704803, "step": 773, "step_time": 28.476314011990326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3132788836956024, "epoch": 0.01548, "grad_norm": 0.04181647673249245, "kl": 0.24410361051559448, "learning_rate": 7.99919315833034e-06, "loss": -0.0466, "step": 774, "step_time": 14.729159114969661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 5.655172348022461, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.243981182575226, "epoch": 0.0155, "frac_reward_zero_std": 0.25, "grad_norm": 0.07377155125141144, "kl": 0.37885425612330437, "learning_rate": 7.999190970384487e-06, "loss": -0.0702, "num_tokens": 19320428.0, "reward": 0.7283214330673218, "reward_std": 0.8159937262535095, "rewards/rollout_reward_func/mean": 0.7283214330673218, "rewards/rollout_reward_func/std": 0.8159936666488647, "sampling/importance_sampling_ratio/max": 1.19235360622406, "sampling/importance_sampling_ratio/mean": 0.7356761693954468, "sampling/importance_sampling_ratio/min": 8.774279081080749e-09, "sampling/sampling_logp_difference/max": 2.675973892211914, "sampling/sampling_logp_difference/mean": 0.41875314712524414, "step": 775, "step_time": 23.45882141898619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.252459704875946, "epoch": 0.01552, "grad_norm": 0.0744469091296196, "kl": 0.3412015065550804, "learning_rate": 7.99918877947648e-06, "loss": -0.0702, "step": 776, "step_time": 11.5848105349869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.03125, "completions/mean_terminated_length": 4.91304349899292, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2991663813591003, "epoch": 0.01554, "frac_reward_zero_std": 0.125, "grad_norm": 0.06983449310064316, "kl": 0.3172801751643419, "learning_rate": 7.999186585606318e-06, "loss": -0.0588, "num_tokens": 19380510.0, "reward": 0.6120060682296753, "reward_std": 0.9304050803184509, "rewards/rollout_reward_func/mean": 0.6120060682296753, "rewards/rollout_reward_func/std": 0.9304050803184509, "sampling/importance_sampling_ratio/max": 1.299410104751587, "sampling/importance_sampling_ratio/mean": 0.6379112005233765, "sampling/importance_sampling_ratio/min": 0.00012195529416203499, "sampling/sampling_logp_difference/max": 2.017178773880005, "sampling/sampling_logp_difference/mean": 0.34507668018341064, "step": 777, "step_time": 28.749747678026324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.302673250436783, "epoch": 0.01556, "grad_norm": 0.07162351906299591, "kl": 0.3145143687725067, "learning_rate": 7.999184388774004e-06, "loss": -0.059, "step": 778, "step_time": 13.824156474991469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2154460549354553, "epoch": 0.01558, "frac_reward_zero_std": 0.25, "grad_norm": 0.09934210777282715, "kl": 0.30205050110816956, "learning_rate": 7.999182188979541e-06, "loss": -0.0585, "num_tokens": 19436961.0, "reward": 0.4859515428543091, "reward_std": 0.8839541077613831, "rewards/rollout_reward_func/mean": 0.4859515428543091, "rewards/rollout_reward_func/std": 0.8839540481567383, "sampling/importance_sampling_ratio/max": 1.2167681455612183, "sampling/importance_sampling_ratio/mean": 0.6874486804008484, "sampling/importance_sampling_ratio/min": 5.11857756180234e-09, "sampling/sampling_logp_difference/max": 2.271188497543335, "sampling/sampling_logp_difference/mean": 0.4100865423679352, "step": 779, "step_time": 26.403815716999816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 2.217420056462288, "epoch": 0.0156, "grad_norm": 0.06871354579925537, "kl": 0.3070979844778776, "learning_rate": 7.999179986222931e-06, "loss": -0.059, "step": 780, "step_time": 12.932250653015217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4254990816116333, "epoch": 0.01562, "frac_reward_zero_std": 0.125, "grad_norm": 0.08582846075296402, "kl": 0.41609229147434235, "learning_rate": 7.999177780504174e-06, "loss": -0.0382, "num_tokens": 19486785.0, "reward": 0.22009940445423126, "reward_std": 0.9581897854804993, "rewards/rollout_reward_func/mean": 0.22009940445423126, "rewards/rollout_reward_func/std": 0.9581897258758545, "sampling/importance_sampling_ratio/max": 1.3157596588134766, "sampling/importance_sampling_ratio/mean": 0.606041431427002, "sampling/importance_sampling_ratio/min": 1.0311683581676334e-05, "sampling/sampling_logp_difference/max": 1.9851690530776978, "sampling/sampling_logp_difference/mean": 0.3732297420501709, "step": 781, "step_time": 30.202348171995254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004999999888241291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004999999888241291, "entropy": 2.439939320087433, "epoch": 0.01564, "grad_norm": 0.09227844327688217, "kl": 0.40214406326413155, "learning_rate": 7.999175571823275e-06, "loss": -0.0387, "step": 782, "step_time": 13.672805109992623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.75, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.455151855945587, "epoch": 0.01566, "frac_reward_zero_std": 0.125, "grad_norm": 0.07636115700006485, "kl": 0.18557412549853325, "learning_rate": 7.999173360180236e-06, "loss": -0.0758, "num_tokens": 19540322.0, "reward": 0.15948617458343506, "reward_std": 0.8789932131767273, "rewards/rollout_reward_func/mean": 0.15948617458343506, "rewards/rollout_reward_func/std": 0.8789932131767273, "sampling/importance_sampling_ratio/max": 1.2975945472717285, "sampling/importance_sampling_ratio/mean": 0.6395930051803589, "sampling/importance_sampling_ratio/min": 8.684415964133052e-10, "sampling/sampling_logp_difference/max": 2.2381656169891357, "sampling/sampling_logp_difference/mean": 0.38402634859085083, "step": 783, "step_time": 28.38014326899429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.462217301130295, "epoch": 0.01568, "grad_norm": 0.07993151992559433, "kl": 0.18299386650323868, "learning_rate": 7.999171145575059e-06, "loss": -0.0759, "step": 784, "step_time": 14.460540151019813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7217157781124115, "epoch": 0.0157, "frac_reward_zero_std": 0.125, "grad_norm": 0.09166143834590912, "kl": 0.35602346807718277, "learning_rate": 7.999168928007744e-06, "loss": -0.0546, "num_tokens": 19596544.0, "reward": 0.5555103421211243, "reward_std": 0.8763192296028137, "rewards/rollout_reward_func/mean": 0.5555103421211243, "rewards/rollout_reward_func/std": 0.8763192296028137, "sampling/importance_sampling_ratio/max": 1.3761258125305176, "sampling/importance_sampling_ratio/mean": 0.7843594551086426, "sampling/importance_sampling_ratio/min": 0.00015922145394142717, "sampling/sampling_logp_difference/max": 1.9884204864501953, "sampling/sampling_logp_difference/mean": 0.2973845601081848, "step": 785, "step_time": 29.196542290010257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7071938216686249, "epoch": 0.01572, "grad_norm": 0.09074666351079941, "kl": 0.3613862730562687, "learning_rate": 7.999166707478298e-06, "loss": -0.0547, "step": 786, "step_time": 14.770216421980876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.045842468738556, "epoch": 0.01574, "frac_reward_zero_std": 0.125, "grad_norm": 0.09744686633348465, "kl": 0.396263275295496, "learning_rate": 7.999164483986716e-06, "loss": -0.0407, "num_tokens": 19652937.0, "reward": 0.29017335176467896, "reward_std": 0.8229586482048035, "rewards/rollout_reward_func/mean": 0.29017335176467896, "rewards/rollout_reward_func/std": 0.8229585886001587, "sampling/importance_sampling_ratio/max": 1.2632055282592773, "sampling/importance_sampling_ratio/mean": 0.6987001895904541, "sampling/importance_sampling_ratio/min": 4.380377731649787e-07, "sampling/sampling_logp_difference/max": 2.183375120162964, "sampling/sampling_logp_difference/mean": 0.3302590847015381, "step": 787, "step_time": 28.103819446027046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0365923941135406, "epoch": 0.01576, "grad_norm": 0.09835343062877655, "kl": 0.3940436914563179, "learning_rate": 7.999162257533006e-06, "loss": -0.041, "step": 788, "step_time": 15.296838520996971 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.375, "completions/mean_terminated_length": 5.833333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6153917014598846, "epoch": 0.01578, "frac_reward_zero_std": 0.0, "grad_norm": 0.061700016260147095, "kl": 0.5437889397144318, "learning_rate": 7.999160028117167e-06, "loss": -0.0554, "num_tokens": 19714647.0, "reward": 0.42986243963241577, "reward_std": 0.8181285262107849, "rewards/rollout_reward_func/mean": 0.42986243963241577, "rewards/rollout_reward_func/std": 0.8181285858154297, "sampling/importance_sampling_ratio/max": 1.2406448125839233, "sampling/importance_sampling_ratio/mean": 0.4970911741256714, "sampling/importance_sampling_ratio/min": 1.1904500496484616e-08, "sampling/sampling_logp_difference/max": 1.9298334121704102, "sampling/sampling_logp_difference/mean": 0.41255414485931396, "step": 789, "step_time": 30.093805170996347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.616535186767578, "epoch": 0.0158, "grad_norm": 0.07281461358070374, "kl": 0.5689231771975756, "learning_rate": 7.999157795739204e-06, "loss": -0.0551, "step": 790, "step_time": 14.321555789996637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.09375, "completions/mean_terminated_length": 4.678571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7477438151836395, "epoch": 0.01582, "frac_reward_zero_std": 0.375, "grad_norm": 0.14922916889190674, "kl": 0.2332296222448349, "learning_rate": 7.999155560399117e-06, "loss": -0.0399, "num_tokens": 19765545.0, "reward": 0.5187827348709106, "reward_std": 0.8151954412460327, "rewards/rollout_reward_func/mean": 0.5187827348709106, "rewards/rollout_reward_func/std": 0.8151954412460327, "sampling/importance_sampling_ratio/max": 1.2074137926101685, "sampling/importance_sampling_ratio/mean": 0.8357334136962891, "sampling/importance_sampling_ratio/min": 1.1441537708378746e-06, "sampling/sampling_logp_difference/max": 2.004836082458496, "sampling/sampling_logp_difference/mean": 0.29310619831085205, "step": 791, "step_time": 28.879987171007087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7249129936099052, "epoch": 0.01584, "grad_norm": 0.14818672835826874, "kl": 0.23879491165280342, "learning_rate": 7.999153322096908e-06, "loss": -0.0406, "step": 792, "step_time": 13.507422105991282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 4.230769157409668, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6521884202957153, "epoch": 0.01586, "frac_reward_zero_std": 0.5, "grad_norm": 0.04359404742717743, "kl": 0.4578005373477936, "learning_rate": 7.999151080832582e-06, "loss": -0.0415, "num_tokens": 19810618.0, "reward": 0.3494170904159546, "reward_std": 0.887538492679596, "rewards/rollout_reward_func/mean": 0.3494170904159546, "rewards/rollout_reward_func/std": 0.8875384330749512, "sampling/importance_sampling_ratio/max": 1.1064327955245972, "sampling/importance_sampling_ratio/mean": 0.7606490850448608, "sampling/importance_sampling_ratio/min": 2.7456117095425725e-06, "sampling/sampling_logp_difference/max": 2.102778196334839, "sampling/sampling_logp_difference/mean": 0.27170640230178833, "step": 793, "step_time": 25.96742597402772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6357485353946686, "epoch": 0.01588, "grad_norm": 0.046929050236940384, "kl": 0.4715815410017967, "learning_rate": 7.99914883660614e-06, "loss": -0.0418, "step": 794, "step_time": 10.581928688014159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 4.5652174949646, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9763237535953522, "epoch": 0.0159, "frac_reward_zero_std": 0.125, "grad_norm": 0.2161743938922882, "kl": 0.21832281723618507, "learning_rate": 7.999146589417582e-06, "loss": -0.0566, "num_tokens": 19875247.0, "reward": 0.3150820732116699, "reward_std": 0.9227290749549866, "rewards/rollout_reward_func/mean": 0.3150820732116699, "rewards/rollout_reward_func/std": 0.9227290749549866, "sampling/importance_sampling_ratio/max": 1.2472162246704102, "sampling/importance_sampling_ratio/mean": 0.6886775493621826, "sampling/importance_sampling_ratio/min": 2.1911625935899792e-06, "sampling/sampling_logp_difference/max": 1.7429125308990479, "sampling/sampling_logp_difference/mean": 0.35068774223327637, "step": 795, "step_time": 33.13277518301038 }, { "clip_ratio/high_max": 0.021875000093132257, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 1.9453723728656769, "epoch": 0.01592, "grad_norm": 0.0537816658616066, "kl": 0.22433196008205414, "learning_rate": 7.999144339266912e-06, "loss": -0.0575, "step": 796, "step_time": 14.861167539987946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.46875, "completions/mean_terminated_length": 4.766666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4986292403191328, "epoch": 0.01594, "frac_reward_zero_std": 0.25, "grad_norm": 0.06312263756990433, "kl": 0.6824352890253067, "learning_rate": 7.999142086154133e-06, "loss": -0.0576, "num_tokens": 19920739.0, "reward": 0.6502382755279541, "reward_std": 0.8785726428031921, "rewards/rollout_reward_func/mean": 0.6502382755279541, "rewards/rollout_reward_func/std": 0.8785726428031921, "sampling/importance_sampling_ratio/max": 1.1690337657928467, "sampling/importance_sampling_ratio/mean": 0.8240424990653992, "sampling/importance_sampling_ratio/min": 3.4825237094082695e-07, "sampling/sampling_logp_difference/max": 1.9131989479064941, "sampling/sampling_logp_difference/mean": 0.3263692259788513, "step": 797, "step_time": 22.063536100962665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4819182679057121, "epoch": 0.01596, "grad_norm": 0.06287780404090881, "kl": 0.7039239667356014, "learning_rate": 7.999139830079245e-06, "loss": -0.0577, "step": 798, "step_time": 12.102284858992789 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6212685406208038, "epoch": 0.01598, "frac_reward_zero_std": 0.0, "grad_norm": 0.19878239929676056, "kl": 0.5394864529371262, "learning_rate": 7.999137571042253e-06, "loss": -0.0578, "num_tokens": 19978841.0, "reward": 0.2691521942615509, "reward_std": 0.8279897570610046, "rewards/rollout_reward_func/mean": 0.2691521942615509, "rewards/rollout_reward_func/std": 0.8279897570610046, "sampling/importance_sampling_ratio/max": 1.5424960851669312, "sampling/importance_sampling_ratio/mean": 0.7385122776031494, "sampling/importance_sampling_ratio/min": 1.0355373660786427e-06, "sampling/sampling_logp_difference/max": 2.0759353637695312, "sampling/sampling_logp_difference/mean": 0.3177270293235779, "step": 799, "step_time": 29.452760810992913 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.6074575036764145, "epoch": 0.016, "grad_norm": 0.048610664904117584, "kl": 0.5420940220355988, "learning_rate": 7.999135309043157e-06, "loss": -0.0589, "step": 800, "step_time": 14.316070840024622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.517241477966309, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.835394486784935, "epoch": 0.01602, "frac_reward_zero_std": 0.25, "grad_norm": 0.09959063678979874, "kl": 0.4272475615143776, "learning_rate": 7.99913304408196e-06, "loss": -0.038, "num_tokens": 20030675.0, "reward": 0.6477653980255127, "reward_std": 0.8015145063400269, "rewards/rollout_reward_func/mean": 0.6477653980255127, "rewards/rollout_reward_func/std": 0.8015145063400269, "sampling/importance_sampling_ratio/max": 1.2984977960586548, "sampling/importance_sampling_ratio/mean": 0.9180030822753906, "sampling/importance_sampling_ratio/min": 0.0016561876982450485, "sampling/sampling_logp_difference/max": 1.7170345783233643, "sampling/sampling_logp_difference/mean": 0.15787604451179504, "step": 801, "step_time": 25.108581382984994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.837242066860199, "epoch": 0.01604, "grad_norm": 0.0914483442902565, "kl": 0.38716705702245235, "learning_rate": 7.999130776158665e-06, "loss": -0.0381, "step": 802, "step_time": 12.637308070959989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.407407283782959, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6303406953811646, "epoch": 0.01606, "frac_reward_zero_std": 0.125, "grad_norm": 0.2543964684009552, "kl": 1.712150514125824, "learning_rate": 7.999128505273272e-06, "loss": -0.0527, "num_tokens": 20084379.0, "reward": 0.6854901313781738, "reward_std": 0.8638811111450195, "rewards/rollout_reward_func/mean": 0.6854901313781738, "rewards/rollout_reward_func/std": 0.8638810515403748, "sampling/importance_sampling_ratio/max": 1.2018961906433105, "sampling/importance_sampling_ratio/mean": 0.7325905561447144, "sampling/importance_sampling_ratio/min": 6.5075669226644095e-06, "sampling/sampling_logp_difference/max": 1.9134159088134766, "sampling/sampling_logp_difference/mean": 0.3296078145503998, "step": 803, "step_time": 24.535272112028906 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.634888842701912, "epoch": 0.01608, "grad_norm": 0.131196066737175, "kl": 1.0819358751177788, "learning_rate": 7.999126231425785e-06, "loss": -0.0548, "step": 804, "step_time": 12.133029871998588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.061361163854599, "epoch": 0.0161, "frac_reward_zero_std": 0.0, "grad_norm": 0.09520255774259567, "kl": 0.31875161081552505, "learning_rate": 7.999123954616207e-06, "loss": -0.0522, "num_tokens": 20133004.0, "reward": 0.6305561065673828, "reward_std": 0.8300628066062927, "rewards/rollout_reward_func/mean": 0.6305561065673828, "rewards/rollout_reward_func/std": 0.8300628066062927, "sampling/importance_sampling_ratio/max": 1.4339993000030518, "sampling/importance_sampling_ratio/mean": 0.7435231804847717, "sampling/importance_sampling_ratio/min": 3.057967319364252e-07, "sampling/sampling_logp_difference/max": 1.933735966682434, "sampling/sampling_logp_difference/mean": 0.33325445652008057, "step": 805, "step_time": 21.994975178997265 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 2.072662800550461, "epoch": 0.01612, "grad_norm": 0.09485321491956711, "kl": 0.2926889844238758, "learning_rate": 7.999121674844537e-06, "loss": -0.0523, "step": 806, "step_time": 10.70210229200893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 5.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6637073010206223, "epoch": 0.01614, "frac_reward_zero_std": 0.25, "grad_norm": 0.0698610246181488, "kl": 0.3066278137266636, "learning_rate": 7.999119392110784e-06, "loss": -0.072, "num_tokens": 20185797.0, "reward": 0.6716957092285156, "reward_std": 0.8952484130859375, "rewards/rollout_reward_func/mean": 0.6716957092285156, "rewards/rollout_reward_func/std": 0.8952484130859375, "sampling/importance_sampling_ratio/max": 1.2556697130203247, "sampling/importance_sampling_ratio/mean": 0.7657788991928101, "sampling/importance_sampling_ratio/min": 0.00011345107486704364, "sampling/sampling_logp_difference/max": 2.0159878730773926, "sampling/sampling_logp_difference/mean": 0.2774573564529419, "step": 807, "step_time": 26.025147763983114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.67275570333004, "epoch": 0.01616, "grad_norm": 0.06927423924207687, "kl": 0.29405049607157707, "learning_rate": 7.99911710641494e-06, "loss": -0.0719, "step": 808, "step_time": 12.812209354975494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 6.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.369326114654541, "epoch": 0.01618, "frac_reward_zero_std": 0.125, "grad_norm": 0.06016016751527786, "kl": 0.3945281133055687, "learning_rate": 7.999114817757016e-06, "loss": -0.0602, "num_tokens": 20234471.0, "reward": 0.6125873327255249, "reward_std": 0.7927654981613159, "rewards/rollout_reward_func/mean": 0.6125873327255249, "rewards/rollout_reward_func/std": 0.7927654385566711, "sampling/importance_sampling_ratio/max": 1.335774302482605, "sampling/importance_sampling_ratio/mean": 0.6663535833358765, "sampling/importance_sampling_ratio/min": 2.0966622287232894e-06, "sampling/sampling_logp_difference/max": 1.9387516975402832, "sampling/sampling_logp_difference/mean": 0.4037265181541443, "step": 809, "step_time": 25.264356047002366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.375692069530487, "epoch": 0.0162, "grad_norm": 0.0579238124191761, "kl": 0.38728996366262436, "learning_rate": 7.999112526137013e-06, "loss": -0.0604, "step": 810, "step_time": 13.34939758697874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.497668832540512, "epoch": 0.01622, "frac_reward_zero_std": 0.125, "grad_norm": 0.06727741658687592, "kl": 0.5373772624880075, "learning_rate": 7.999110231554928e-06, "loss": -0.0629, "num_tokens": 20288790.0, "reward": 0.473224937915802, "reward_std": 0.9317904114723206, "rewards/rollout_reward_func/mean": 0.473224937915802, "rewards/rollout_reward_func/std": 0.9317904114723206, "sampling/importance_sampling_ratio/max": 1.2586100101470947, "sampling/importance_sampling_ratio/mean": 0.7408923506736755, "sampling/importance_sampling_ratio/min": 0.0004215874068904668, "sampling/sampling_logp_difference/max": 1.7888712882995605, "sampling/sampling_logp_difference/mean": 0.23287542164325714, "step": 811, "step_time": 29.45422191498801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4998324811458588, "epoch": 0.01624, "grad_norm": 0.06761960685253143, "kl": 0.5449909064918756, "learning_rate": 7.99910793401077e-06, "loss": -0.0631, "step": 812, "step_time": 14.972348103998229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.5, "completions/mean_terminated_length": 5.925926208496094, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.204633116722107, "epoch": 0.01626, "frac_reward_zero_std": 0.0, "grad_norm": 0.06975985318422318, "kl": 0.2755882404744625, "learning_rate": 7.999105633504535e-06, "loss": -0.0838, "num_tokens": 20337390.0, "reward": 0.3582146167755127, "reward_std": 0.971768319606781, "rewards/rollout_reward_func/mean": 0.3582146167755127, "rewards/rollout_reward_func/std": 0.9717682600021362, "sampling/importance_sampling_ratio/max": 1.182763934135437, "sampling/importance_sampling_ratio/mean": 0.6285659670829773, "sampling/importance_sampling_ratio/min": 2.069231868517818e-06, "sampling/sampling_logp_difference/max": 2.683523178100586, "sampling/sampling_logp_difference/mean": 0.36834055185317993, "step": 813, "step_time": 24.78792926800088 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.1965847313404083, "epoch": 0.01628, "grad_norm": 0.08168615400791168, "kl": 0.2679141238331795, "learning_rate": 7.99910333003623e-06, "loss": -0.0837, "step": 814, "step_time": 12.092536273004953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.03125, "completions/mean_terminated_length": 4.607142925262451, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4857698678970337, "epoch": 0.0163, "frac_reward_zero_std": 0.125, "grad_norm": 0.11738945543766022, "kl": 0.2190195769071579, "learning_rate": 7.999101023605855e-06, "loss": -0.0497, "num_tokens": 20392346.0, "reward": 0.7609506249427795, "reward_std": 0.8375509977340698, "rewards/rollout_reward_func/mean": 0.7609506249427795, "rewards/rollout_reward_func/std": 0.8375509977340698, "sampling/importance_sampling_ratio/max": 1.332183837890625, "sampling/importance_sampling_ratio/mean": 0.7526181936264038, "sampling/importance_sampling_ratio/min": 9.227827104041353e-05, "sampling/sampling_logp_difference/max": 1.8879929780960083, "sampling/sampling_logp_difference/mean": 0.24899399280548096, "step": 815, "step_time": 25.47015931896749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4802323132753372, "epoch": 0.01632, "grad_norm": 0.11702965199947357, "kl": 0.2231472246348858, "learning_rate": 7.999098714213413e-06, "loss": -0.05, "step": 816, "step_time": 12.881191085994942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2627620995044708, "epoch": 0.01634, "frac_reward_zero_std": 0.25, "grad_norm": 0.05563928186893463, "kl": 0.24817654490470886, "learning_rate": 7.999096401858905e-06, "loss": -0.0461, "num_tokens": 20450786.0, "reward": 0.8623107671737671, "reward_std": 0.7611375451087952, "rewards/rollout_reward_func/mean": 0.8623107671737671, "rewards/rollout_reward_func/std": 0.7611375451087952, "sampling/importance_sampling_ratio/max": 1.60480797290802, "sampling/importance_sampling_ratio/mean": 0.8690903186798096, "sampling/importance_sampling_ratio/min": 0.0011059350799769163, "sampling/sampling_logp_difference/max": 1.529299259185791, "sampling/sampling_logp_difference/mean": 0.19819262623786926, "step": 817, "step_time": 28.779917956038844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2550781443715096, "epoch": 0.01636, "grad_norm": 0.05366170033812523, "kl": 0.25179215893149376, "learning_rate": 7.999094086542335e-06, "loss": -0.0462, "step": 818, "step_time": 14.932937435019994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.84375, "completions/mean_terminated_length": 5.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6868855506181717, "epoch": 0.01638, "frac_reward_zero_std": 0.125, "grad_norm": 0.11777252703905106, "kl": 0.40193886309862137, "learning_rate": 7.999091768263706e-06, "loss": -0.0691, "num_tokens": 20498177.0, "reward": 0.9809650182723999, "reward_std": 0.6553592681884766, "rewards/rollout_reward_func/mean": 0.9809650182723999, "rewards/rollout_reward_func/std": 0.6553592681884766, "sampling/importance_sampling_ratio/max": 1.1733596324920654, "sampling/importance_sampling_ratio/mean": 0.6766510009765625, "sampling/importance_sampling_ratio/min": 5.971186328679323e-05, "sampling/sampling_logp_difference/max": 2.0212624073028564, "sampling/sampling_logp_difference/mean": 0.2976066470146179, "step": 819, "step_time": 23.484196946956217 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.004032257944345474, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007938507944345474, "entropy": 1.6800422817468643, "epoch": 0.0164, "grad_norm": 0.04904362931847572, "kl": 0.41795285791158676, "learning_rate": 7.999089447023018e-06, "loss": -0.0695, "step": 820, "step_time": 12.272474198020063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.933333396911621, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4508790634572506, "epoch": 0.01642, "frac_reward_zero_std": 0.25, "grad_norm": 0.07096479088068008, "kl": 0.45093751326203346, "learning_rate": 7.999087122820274e-06, "loss": -0.0305, "num_tokens": 20549820.0, "reward": 0.4746659994125366, "reward_std": 0.8118072152137756, "rewards/rollout_reward_func/mean": 0.4746659994125366, "rewards/rollout_reward_func/std": 0.8118072152137756, "sampling/importance_sampling_ratio/max": 1.3376089334487915, "sampling/importance_sampling_ratio/mean": 0.8387846946716309, "sampling/importance_sampling_ratio/min": 4.4414093736122595e-07, "sampling/sampling_logp_difference/max": 1.7053858041763306, "sampling/sampling_logp_difference/mean": 0.2671840190887451, "step": 821, "step_time": 20.882896863040514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4467222094535828, "epoch": 0.01644, "grad_norm": 0.05757167190313339, "kl": 0.4466933608055115, "learning_rate": 7.999084795655478e-06, "loss": -0.0308, "step": 822, "step_time": 11.29335585705121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.740740776062012, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.764783427119255, "epoch": 0.01646, "frac_reward_zero_std": 0.125, "grad_norm": 0.09544014185667038, "kl": 0.36634183675050735, "learning_rate": 7.99908246552863e-06, "loss": -0.0686, "num_tokens": 20603502.0, "reward": 0.5629494190216064, "reward_std": 0.8879826068878174, "rewards/rollout_reward_func/mean": 0.5629494190216064, "rewards/rollout_reward_func/std": 0.8879826068878174, "sampling/importance_sampling_ratio/max": 1.51034677028656, "sampling/importance_sampling_ratio/mean": 0.7159417867660522, "sampling/importance_sampling_ratio/min": 9.711595339467749e-05, "sampling/sampling_logp_difference/max": 1.741112470626831, "sampling/sampling_logp_difference/mean": 0.2850901186466217, "step": 823, "step_time": 29.418732347985497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7547958865761757, "epoch": 0.01648, "grad_norm": 0.0982859805226326, "kl": 0.3868676573038101, "learning_rate": 7.999080132439733e-06, "loss": -0.0692, "step": 824, "step_time": 15.548312883998733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.53125, "completions/mean_terminated_length": 5.136363983154297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5627898573875427, "epoch": 0.0165, "frac_reward_zero_std": 0.125, "grad_norm": 0.07266105711460114, "kl": 0.5022385530173779, "learning_rate": 7.99907779638879e-06, "loss": -0.0707, "num_tokens": 20665163.0, "reward": 0.007010415196418762, "reward_std": 0.8351149559020996, "rewards/rollout_reward_func/mean": 0.007010415196418762, "rewards/rollout_reward_func/std": 0.8351148962974548, "sampling/importance_sampling_ratio/max": 1.9197202920913696, "sampling/importance_sampling_ratio/mean": 0.5708000659942627, "sampling/importance_sampling_ratio/min": 1.0345024037405892e-07, "sampling/sampling_logp_difference/max": 1.9000338315963745, "sampling/sampling_logp_difference/mean": 0.3746642470359802, "step": 825, "step_time": 30.458460599009413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5663280487060547, "epoch": 0.01652, "grad_norm": 0.07442431151866913, "kl": 0.49426450952887535, "learning_rate": 7.999075457375802e-06, "loss": -0.0712, "step": 826, "step_time": 13.818647747975774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1955197006464005, "epoch": 0.01654, "frac_reward_zero_std": 0.375, "grad_norm": 0.0645846500992775, "kl": 0.3265988491475582, "learning_rate": 7.999073115400772e-06, "loss": -0.029, "num_tokens": 20718558.0, "reward": 0.5250346660614014, "reward_std": 0.8439965844154358, "rewards/rollout_reward_func/mean": 0.5250346660614014, "rewards/rollout_reward_func/std": 0.8439965844154358, "sampling/importance_sampling_ratio/max": 1.4082111120224, "sampling/importance_sampling_ratio/mean": 0.8433312773704529, "sampling/importance_sampling_ratio/min": 3.1794236576843105e-08, "sampling/sampling_logp_difference/max": 2.953113079071045, "sampling/sampling_logp_difference/mean": 0.2796557545661926, "step": 827, "step_time": 23.778454057057388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1967628225684166, "epoch": 0.01656, "grad_norm": 0.05566530302166939, "kl": 0.3418833389878273, "learning_rate": 7.999070770463702e-06, "loss": -0.0292, "step": 828, "step_time": 12.399523323983885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.555555820465088, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8252372443675995, "epoch": 0.01658, "frac_reward_zero_std": 0.125, "grad_norm": 0.0772741511464119, "kl": 0.28476518020033836, "learning_rate": 7.999068422564597e-06, "loss": -0.0291, "num_tokens": 20780160.0, "reward": 0.686318039894104, "reward_std": 0.8238630294799805, "rewards/rollout_reward_func/mean": 0.686318039894104, "rewards/rollout_reward_func/std": 0.8238629698753357, "sampling/importance_sampling_ratio/max": 1.4591408967971802, "sampling/importance_sampling_ratio/mean": 0.8832836747169495, "sampling/importance_sampling_ratio/min": 6.270554564480335e-08, "sampling/sampling_logp_difference/max": 2.015711784362793, "sampling/sampling_logp_difference/mean": 0.3068038821220398, "step": 829, "step_time": 30.138993090979056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8296413123607635, "epoch": 0.0166, "grad_norm": 0.07583315670490265, "kl": 0.27744070440530777, "learning_rate": 7.999066071703455e-06, "loss": -0.0292, "step": 830, "step_time": 15.193787847994827 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.660905221477151, "epoch": 0.01662, "frac_reward_zero_std": 0.25, "grad_norm": 0.11199713498353958, "kl": 0.3690731357783079, "learning_rate": 7.99906371788028e-06, "loss": -0.0498, "num_tokens": 20828632.0, "reward": 0.4410763084888458, "reward_std": 0.9323143362998962, "rewards/rollout_reward_func/mean": 0.4410763084888458, "rewards/rollout_reward_func/std": 0.9323143362998962, "sampling/importance_sampling_ratio/max": 1.2350417375564575, "sampling/importance_sampling_ratio/mean": 0.7956812977790833, "sampling/importance_sampling_ratio/min": 3.376712356839562e-06, "sampling/sampling_logp_difference/max": 2.037595510482788, "sampling/sampling_logp_difference/mean": 0.32622119784355164, "step": 831, "step_time": 23.31076490000123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6716226264834404, "epoch": 0.01664, "grad_norm": 0.11057266592979431, "kl": 0.3781994543969631, "learning_rate": 7.999061361095076e-06, "loss": -0.0503, "step": 832, "step_time": 12.162081019952893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.96875, "completions/mean_terminated_length": 4.440000057220459, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.030778829008341, "epoch": 0.01666, "frac_reward_zero_std": 0.0, "grad_norm": 0.06058092042803764, "kl": 0.19213157333433628, "learning_rate": 7.999059001347845e-06, "loss": -0.0471, "num_tokens": 20894213.0, "reward": 0.4697532057762146, "reward_std": 0.824857771396637, "rewards/rollout_reward_func/mean": 0.4697532057762146, "rewards/rollout_reward_func/std": 0.824857771396637, "sampling/importance_sampling_ratio/max": 1.40140700340271, "sampling/importance_sampling_ratio/mean": 0.828186571598053, "sampling/importance_sampling_ratio/min": 7.343865036091302e-07, "sampling/sampling_logp_difference/max": 2.0381953716278076, "sampling/sampling_logp_difference/mean": 0.34258735179901123, "step": 833, "step_time": 29.79219092996209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0419010519981384, "epoch": 0.01668, "grad_norm": 0.06454675644636154, "kl": 0.1882983110845089, "learning_rate": 7.999056638638587e-06, "loss": -0.0475, "step": 834, "step_time": 15.616755769005977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0249923169612885, "epoch": 0.0167, "frac_reward_zero_std": 0.125, "grad_norm": 0.05989727005362511, "kl": 0.19702517986297607, "learning_rate": 7.999054272967306e-06, "loss": -0.055, "num_tokens": 20941362.0, "reward": 0.6671504974365234, "reward_std": 0.9051666855812073, "rewards/rollout_reward_func/mean": 0.6671504974365234, "rewards/rollout_reward_func/std": 0.905166745185852, "sampling/importance_sampling_ratio/max": 1.2939350605010986, "sampling/importance_sampling_ratio/mean": 0.7884616255760193, "sampling/importance_sampling_ratio/min": 5.0990429656394554e-08, "sampling/sampling_logp_difference/max": 2.2539305686950684, "sampling/sampling_logp_difference/mean": 0.3852695822715759, "step": 835, "step_time": 24.59571666494594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0302489697933197, "epoch": 0.01672, "grad_norm": 0.0594017319381237, "kl": 0.19729754328727722, "learning_rate": 7.999051904334003e-06, "loss": -0.0552, "step": 836, "step_time": 11.47575901102391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.59375, "completions/mean_terminated_length": 5.227272987365723, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5767871737480164, "epoch": 0.01674, "frac_reward_zero_std": 0.0, "grad_norm": 0.0678325891494751, "kl": 0.17666572518646717, "learning_rate": 7.999049532738682e-06, "loss": -0.0922, "num_tokens": 21007442.0, "reward": 0.05184940993785858, "reward_std": 0.8021258115768433, "rewards/rollout_reward_func/mean": 0.05184940993785858, "rewards/rollout_reward_func/std": 0.8021258115768433, "sampling/importance_sampling_ratio/max": 1.4759103059768677, "sampling/importance_sampling_ratio/mean": 0.574312150478363, "sampling/importance_sampling_ratio/min": 0.0001925444812513888, "sampling/sampling_logp_difference/max": 1.8369579315185547, "sampling/sampling_logp_difference/mean": 0.33818724751472473, "step": 837, "step_time": 38.16452724498231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5759360790252686, "epoch": 0.01676, "grad_norm": 0.06859785318374634, "kl": 0.18020425736904144, "learning_rate": 7.999047158181346e-06, "loss": -0.0921, "step": 838, "step_time": 17.255755417980254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 5.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9749728441238403, "epoch": 0.01678, "frac_reward_zero_std": 0.375, "grad_norm": 0.050173915922641754, "kl": 0.26584640704095364, "learning_rate": 7.999044780661995e-06, "loss": -0.0411, "num_tokens": 21060129.0, "reward": 0.6796141862869263, "reward_std": 0.9765206575393677, "rewards/rollout_reward_func/mean": 0.6796141862869263, "rewards/rollout_reward_func/std": 0.9765206575393677, "sampling/importance_sampling_ratio/max": 1.2205272912979126, "sampling/importance_sampling_ratio/mean": 0.6425314545631409, "sampling/importance_sampling_ratio/min": 1.5158738278842065e-05, "sampling/sampling_logp_difference/max": 1.9555418491363525, "sampling/sampling_logp_difference/mean": 0.29602956771850586, "step": 839, "step_time": 30.23261210601777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9722106456756592, "epoch": 0.0168, "grad_norm": 0.05108048766851425, "kl": 0.2607864458113909, "learning_rate": 7.999042400180632e-06, "loss": -0.0411, "step": 840, "step_time": 12.437362009019125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.28125, "completions/mean_terminated_length": 4.839999675750732, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9430715441703796, "epoch": 0.01682, "frac_reward_zero_std": 0.125, "grad_norm": 0.09855324774980545, "kl": 0.14357001148164272, "learning_rate": 7.99904001673726e-06, "loss": -0.0548, "num_tokens": 21118773.0, "reward": 0.4062834084033966, "reward_std": 0.915442168712616, "rewards/rollout_reward_func/mean": 0.4062834084033966, "rewards/rollout_reward_func/std": 0.915442168712616, "sampling/importance_sampling_ratio/max": 1.3566207885742188, "sampling/importance_sampling_ratio/mean": 0.7029029726982117, "sampling/importance_sampling_ratio/min": 7.039444426482078e-06, "sampling/sampling_logp_difference/max": 1.9381966590881348, "sampling/sampling_logp_difference/mean": 0.30276960134506226, "step": 841, "step_time": 30.505184148001717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.950130820274353, "epoch": 0.01684, "grad_norm": 0.12120742350816727, "kl": 0.1469513438642025, "learning_rate": 7.99903763033188e-06, "loss": -0.0554, "step": 842, "step_time": 14.395455123012653 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 4.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.125598222017288, "epoch": 0.01686, "frac_reward_zero_std": 0.125, "grad_norm": 0.12263032793998718, "kl": 0.2084354292601347, "learning_rate": 7.999035240964499e-06, "loss": -0.0599, "num_tokens": 21172227.0, "reward": 0.4559197425842285, "reward_std": 0.9602292776107788, "rewards/rollout_reward_func/mean": 0.4559197425842285, "rewards/rollout_reward_func/std": 0.9602292776107788, "sampling/importance_sampling_ratio/max": 1.3002097606658936, "sampling/importance_sampling_ratio/mean": 0.7120853662490845, "sampling/importance_sampling_ratio/min": 6.323825800791383e-05, "sampling/sampling_logp_difference/max": 1.537572979927063, "sampling/sampling_logp_difference/mean": 0.3205394744873047, "step": 843, "step_time": 27.205220439995173 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.1205652356147766, "epoch": 0.01688, "grad_norm": 0.11867500841617584, "kl": 0.21257999539375305, "learning_rate": 7.999032848635112e-06, "loss": -0.0603, "step": 844, "step_time": 12.870134889992187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.53125, "completions/mean_terminated_length": 5.159999847412109, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.276849329471588, "epoch": 0.0169, "frac_reward_zero_std": 0.0, "grad_norm": 0.15498776733875275, "kl": 0.31471559032797813, "learning_rate": 7.999030453343725e-06, "loss": -0.0726, "num_tokens": 21226985.0, "reward": 0.6130123138427734, "reward_std": 0.8928760290145874, "rewards/rollout_reward_func/mean": 0.6130123138427734, "rewards/rollout_reward_func/std": 0.8928759694099426, "sampling/importance_sampling_ratio/max": 1.3490546941757202, "sampling/importance_sampling_ratio/mean": 0.7153297066688538, "sampling/importance_sampling_ratio/min": 2.9783628633595072e-05, "sampling/sampling_logp_difference/max": 1.775838017463684, "sampling/sampling_logp_difference/mean": 0.31179070472717285, "step": 845, "step_time": 26.82773974101292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2754234075546265, "epoch": 0.01692, "grad_norm": 0.17542457580566406, "kl": 0.3018126003444195, "learning_rate": 7.999028055090344e-06, "loss": -0.0731, "step": 846, "step_time": 12.814395155990496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.1875, "completions/mean_terminated_length": 5.153846263885498, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0815663039684296, "epoch": 0.01694, "frac_reward_zero_std": 0.125, "grad_norm": 0.0626460388302803, "kl": 0.6170611456036568, "learning_rate": 7.999025653874965e-06, "loss": -0.0698, "num_tokens": 21286678.0, "reward": 0.5422077775001526, "reward_std": 0.8603806495666504, "rewards/rollout_reward_func/mean": 0.5422077775001526, "rewards/rollout_reward_func/std": 0.8603805303573608, "sampling/importance_sampling_ratio/max": 1.3470510244369507, "sampling/importance_sampling_ratio/mean": 0.6494239568710327, "sampling/importance_sampling_ratio/min": 3.25106924492502e-07, "sampling/sampling_logp_difference/max": 3.0143778324127197, "sampling/sampling_logp_difference/mean": 0.3854912519454956, "step": 847, "step_time": 26.11781562498072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.0884912312030792, "epoch": 0.01696, "grad_norm": 0.05125686898827553, "kl": 0.5628156922757626, "learning_rate": 7.999023249697595e-06, "loss": -0.0701, "step": 848, "step_time": 12.300298860034673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 4.304347991943359, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1052022576332092, "epoch": 0.01698, "frac_reward_zero_std": 0.0, "grad_norm": 0.11219729483127594, "kl": 0.22969936206936836, "learning_rate": 7.999020842558234e-06, "loss": -0.0517, "num_tokens": 21350247.0, "reward": 0.4792092740535736, "reward_std": 0.9288373589515686, "rewards/rollout_reward_func/mean": 0.4792092740535736, "rewards/rollout_reward_func/std": 0.928837239742279, "sampling/importance_sampling_ratio/max": 1.158367395401001, "sampling/importance_sampling_ratio/mean": 0.6571782827377319, "sampling/importance_sampling_ratio/min": 4.321676533436403e-05, "sampling/sampling_logp_difference/max": 1.8110682964324951, "sampling/sampling_logp_difference/mean": 0.2984163761138916, "step": 849, "step_time": 30.089228424069006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.112370401620865, "epoch": 0.017, "grad_norm": 0.10445111989974976, "kl": 0.22425217553973198, "learning_rate": 7.999018432456886e-06, "loss": -0.052, "step": 850, "step_time": 13.840566357976058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.9375, "completions/mean_terminated_length": 4.782608985900879, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8564067780971527, "epoch": 0.01702, "frac_reward_zero_std": 0.125, "grad_norm": 0.05130708962678909, "kl": 0.20657800510525703, "learning_rate": 7.99901601939355e-06, "loss": -0.0733, "num_tokens": 21401170.0, "reward": 0.4782482385635376, "reward_std": 0.9901541471481323, "rewards/rollout_reward_func/mean": 0.4782482385635376, "rewards/rollout_reward_func/std": 0.9901540875434875, "sampling/importance_sampling_ratio/max": 1.4031028747558594, "sampling/importance_sampling_ratio/mean": 0.7032768726348877, "sampling/importance_sampling_ratio/min": 5.281259873868294e-09, "sampling/sampling_logp_difference/max": 2.162855625152588, "sampling/sampling_logp_difference/mean": 0.33127209544181824, "step": 851, "step_time": 27.251834120048443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.860082034021616, "epoch": 0.01704, "grad_norm": 0.05142707750201225, "kl": 0.19933446124196053, "learning_rate": 7.999013603368233e-06, "loss": -0.0734, "step": 852, "step_time": 11.99273647699738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 5.161290168762207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.018859088420868, "epoch": 0.01706, "frac_reward_zero_std": 0.125, "grad_norm": 0.12421519309282303, "kl": 0.33490995690226555, "learning_rate": 7.999011184380935e-06, "loss": -0.0591, "num_tokens": 21454974.0, "reward": 0.36800429224967957, "reward_std": 0.870977520942688, "rewards/rollout_reward_func/mean": 0.36800429224967957, "rewards/rollout_reward_func/std": 0.8709774613380432, "sampling/importance_sampling_ratio/max": 1.4115948677062988, "sampling/importance_sampling_ratio/mean": 0.7304352521896362, "sampling/importance_sampling_ratio/min": 1.7588721675565466e-05, "sampling/sampling_logp_difference/max": 1.9496819972991943, "sampling/sampling_logp_difference/mean": 0.34900668263435364, "step": 853, "step_time": 26.540759966010228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0071484744548798, "epoch": 0.01708, "grad_norm": 0.11864978820085526, "kl": 0.3418263681232929, "learning_rate": 7.999008762431658e-06, "loss": -0.0594, "step": 854, "step_time": 12.171815996000078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.4375, "completions/mean_terminated_length": 4.583333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.011922851204872, "epoch": 0.0171, "frac_reward_zero_std": 0.125, "grad_norm": 0.09087826311588287, "kl": 0.21941879391670227, "learning_rate": 7.999006337520404e-06, "loss": -0.0807, "num_tokens": 21511084.0, "reward": 0.5726751089096069, "reward_std": 0.9317570328712463, "rewards/rollout_reward_func/mean": 0.5726751089096069, "rewards/rollout_reward_func/std": 0.9317569732666016, "sampling/importance_sampling_ratio/max": 1.2106127738952637, "sampling/importance_sampling_ratio/mean": 0.6579992771148682, "sampling/importance_sampling_ratio/min": 3.192499571014196e-05, "sampling/sampling_logp_difference/max": 1.6116209030151367, "sampling/sampling_logp_difference/mean": 0.32893669605255127, "step": 855, "step_time": 26.867400670947973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0010465309023857, "epoch": 0.01712, "grad_norm": 0.08152693510055542, "kl": 0.21531729400157928, "learning_rate": 7.999003909647179e-06, "loss": -0.0812, "step": 856, "step_time": 12.370268366968958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9590262360870838, "epoch": 0.01714, "frac_reward_zero_std": 0.25, "grad_norm": 0.04585384950041771, "kl": 0.2663996107876301, "learning_rate": 7.99900147881198e-06, "loss": -0.0474, "num_tokens": 21564200.0, "reward": 0.9006513357162476, "reward_std": 0.7466962337493896, "rewards/rollout_reward_func/mean": 0.9006513357162476, "rewards/rollout_reward_func/std": 0.7466961741447449, "sampling/importance_sampling_ratio/max": 1.2262550592422485, "sampling/importance_sampling_ratio/mean": 0.9033082127571106, "sampling/importance_sampling_ratio/min": 5.563688318943605e-05, "sampling/sampling_logp_difference/max": 1.8464202880859375, "sampling/sampling_logp_difference/mean": 0.17673268914222717, "step": 857, "step_time": 29.474534409993794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.938340350985527, "epoch": 0.01716, "grad_norm": 0.04320986941456795, "kl": 0.2667730525135994, "learning_rate": 7.998999045014814e-06, "loss": -0.0475, "step": 858, "step_time": 15.687890574976336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.78125, "completions/mean_terminated_length": 4.5652174949646, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.385107100009918, "epoch": 0.01718, "frac_reward_zero_std": 0.125, "grad_norm": 0.181269109249115, "kl": 0.19874883070588112, "learning_rate": 7.99899660825568e-06, "loss": -0.0708, "num_tokens": 21616953.0, "reward": 0.25523805618286133, "reward_std": 0.9630568623542786, "rewards/rollout_reward_func/mean": 0.25523805618286133, "rewards/rollout_reward_func/std": 0.9630568623542786, "sampling/importance_sampling_ratio/max": 1.4730616807937622, "sampling/importance_sampling_ratio/mean": 0.671506404876709, "sampling/importance_sampling_ratio/min": 9.343505371361971e-05, "sampling/sampling_logp_difference/max": 2.296910524368286, "sampling/sampling_logp_difference/mean": 0.3339790403842926, "step": 859, "step_time": 27.11069632600993 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.370778441429138, "epoch": 0.0172, "grad_norm": 0.09353435784578323, "kl": 0.21180923655629158, "learning_rate": 7.998994168534584e-06, "loss": -0.0717, "step": 860, "step_time": 12.559881752997171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0, "completions/mean_terminated_length": 4.266666889190674, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0555258989334106, "epoch": 0.01722, "frac_reward_zero_std": 0.25, "grad_norm": 0.08603113889694214, "kl": 0.5743136778473854, "learning_rate": 7.998991725851526e-06, "loss": -0.0569, "num_tokens": 21664343.0, "reward": 0.8966318368911743, "reward_std": 0.7867602109909058, "rewards/rollout_reward_func/mean": 0.8966318368911743, "rewards/rollout_reward_func/std": 0.7867602109909058, "sampling/importance_sampling_ratio/max": 1.251609206199646, "sampling/importance_sampling_ratio/mean": 0.8462119102478027, "sampling/importance_sampling_ratio/min": 1.3471626516547985e-05, "sampling/sampling_logp_difference/max": 1.6566548347473145, "sampling/sampling_logp_difference/mean": 0.21713364124298096, "step": 861, "step_time": 24.123927274980815 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012531328480690718, "entropy": 1.0442683696746826, "epoch": 0.01724, "grad_norm": 0.03444519266486168, "kl": 0.6086797565221786, "learning_rate": 7.998989280206506e-06, "loss": -0.0572, "step": 862, "step_time": 12.508289511984913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1596190612763166, "epoch": 0.01726, "frac_reward_zero_std": 0.125, "grad_norm": 0.17181383073329926, "kl": 0.6608027294278145, "learning_rate": 7.998986831599532e-06, "loss": -0.0546, "num_tokens": 21721250.0, "reward": 0.5698708891868591, "reward_std": 0.8261920213699341, "rewards/rollout_reward_func/mean": 0.5698708891868591, "rewards/rollout_reward_func/std": 0.8261920213699341, "sampling/importance_sampling_ratio/max": 1.5989474058151245, "sampling/importance_sampling_ratio/mean": 0.931370735168457, "sampling/importance_sampling_ratio/min": 4.831608384847641e-05, "sampling/sampling_logp_difference/max": 1.5972094535827637, "sampling/sampling_logp_difference/mean": 0.21512636542320251, "step": 863, "step_time": 25.662503283005208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.158445494249463, "epoch": 0.01728, "grad_norm": 0.18354183435440063, "kl": 0.6893778592348099, "learning_rate": 7.998984380030604e-06, "loss": -0.0548, "step": 864, "step_time": 13.15668790001655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8596544731408358, "epoch": 0.0173, "frac_reward_zero_std": 0.25, "grad_norm": 0.12249686568975449, "kl": 1.2679208368062973, "learning_rate": 7.998981925499722e-06, "loss": -0.0403, "num_tokens": 21772389.0, "reward": 0.8309738636016846, "reward_std": 0.7616227865219116, "rewards/rollout_reward_func/mean": 0.8309738636016846, "rewards/rollout_reward_func/std": 0.7616227865219116, "sampling/importance_sampling_ratio/max": 1.3583801984786987, "sampling/importance_sampling_ratio/mean": 0.8705856800079346, "sampling/importance_sampling_ratio/min": 3.027699676749762e-06, "sampling/sampling_logp_difference/max": 2.189063549041748, "sampling/sampling_logp_difference/mean": 0.19625669717788696, "step": 865, "step_time": 22.97899504599627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8606952019035816, "epoch": 0.01732, "grad_norm": 0.10597161203622818, "kl": 1.1366308629512787, "learning_rate": 7.998979468006891e-06, "loss": -0.0409, "step": 866, "step_time": 12.133453245973215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0238401517271996, "epoch": 0.01734, "frac_reward_zero_std": 0.25, "grad_norm": 0.1516282707452774, "kl": 0.21698148921132088, "learning_rate": 7.998977007552115e-06, "loss": -0.0187, "num_tokens": 21822263.0, "reward": 0.7109659910202026, "reward_std": 0.9172026515007019, "rewards/rollout_reward_func/mean": 0.7109659910202026, "rewards/rollout_reward_func/std": 0.9172025918960571, "sampling/importance_sampling_ratio/max": 1.2245092391967773, "sampling/importance_sampling_ratio/mean": 0.8666074275970459, "sampling/importance_sampling_ratio/min": 7.252211071318015e-05, "sampling/sampling_logp_difference/max": 1.4544576406478882, "sampling/sampling_logp_difference/mean": 0.19571077823638916, "step": 867, "step_time": 23.135043410002254 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.0098217949271202, "epoch": 0.01736, "grad_norm": 0.15152400732040405, "kl": 0.21390926465392113, "learning_rate": 7.998974544135391e-06, "loss": -0.0191, "step": 868, "step_time": 11.95369719399605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.25, "completions/mean_terminated_length": 4.533333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.043896034359932, "epoch": 0.01738, "frac_reward_zero_std": 0.375, "grad_norm": 0.07837478816509247, "kl": 0.5797938778996468, "learning_rate": 7.998972077756728e-06, "loss": -0.0333, "num_tokens": 21881924.0, "reward": 0.8765965700149536, "reward_std": 0.7270304560661316, "rewards/rollout_reward_func/mean": 0.8765965700149536, "rewards/rollout_reward_func/std": 0.7270303964614868, "sampling/importance_sampling_ratio/max": 1.1715799570083618, "sampling/importance_sampling_ratio/mean": 0.8548582792282104, "sampling/importance_sampling_ratio/min": 1.7926700820680708e-05, "sampling/sampling_logp_difference/max": 1.9887622594833374, "sampling/sampling_logp_difference/mean": 0.19838029146194458, "step": 869, "step_time": 28.070176463981625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0487413108348846, "epoch": 0.0174, "grad_norm": 0.07779128104448318, "kl": 0.5604914501309395, "learning_rate": 7.998969608416123e-06, "loss": -0.0332, "step": 870, "step_time": 14.958668374049012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.5806450843811035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9015556685626507, "epoch": 0.01742, "frac_reward_zero_std": 0.125, "grad_norm": 0.07073429971933365, "kl": 0.8960242792963982, "learning_rate": 7.998967136113582e-06, "loss": -0.0387, "num_tokens": 21933256.0, "reward": 0.6943552494049072, "reward_std": 0.76622474193573, "rewards/rollout_reward_func/mean": 0.6943552494049072, "rewards/rollout_reward_func/std": 0.76622474193573, "sampling/importance_sampling_ratio/max": 1.3154808282852173, "sampling/importance_sampling_ratio/mean": 0.882864773273468, "sampling/importance_sampling_ratio/min": 0.0002836627245415002, "sampling/sampling_logp_difference/max": 1.8578095436096191, "sampling/sampling_logp_difference/mean": 0.18024152517318726, "step": 871, "step_time": 24.505523027008167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9051999300718307, "epoch": 0.01744, "grad_norm": 0.060752980411052704, "kl": 0.8010500743985176, "learning_rate": 7.998964660849104e-06, "loss": -0.0391, "step": 872, "step_time": 12.727252708020387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.896551609039307, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3713093996047974, "epoch": 0.01746, "frac_reward_zero_std": 0.375, "grad_norm": 0.0486307293176651, "kl": 0.31639454513788223, "learning_rate": 7.998962182622696e-06, "loss": -0.0396, "num_tokens": 21988097.0, "reward": 0.5123941898345947, "reward_std": 0.7728413939476013, "rewards/rollout_reward_func/mean": 0.5123941898345947, "rewards/rollout_reward_func/std": 0.7728413939476013, "sampling/importance_sampling_ratio/max": 1.5649579763412476, "sampling/importance_sampling_ratio/mean": 0.8384846448898315, "sampling/importance_sampling_ratio/min": 0.00015145163342822343, "sampling/sampling_logp_difference/max": 2.243901014328003, "sampling/sampling_logp_difference/mean": 0.24735915660858154, "step": 873, "step_time": 27.77652517502429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3767125606536865, "epoch": 0.01748, "grad_norm": 0.04871160537004471, "kl": 0.3026827350258827, "learning_rate": 7.998959701434357e-06, "loss": -0.0397, "step": 874, "step_time": 14.874186052009463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.65625, "completions/mean_terminated_length": 4.178571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.332348184660077, "epoch": 0.0175, "frac_reward_zero_std": 0.375, "grad_norm": 0.12324276566505432, "kl": 0.556504599750042, "learning_rate": 7.99895721728409e-06, "loss": -0.0193, "num_tokens": 22035182.0, "reward": 0.36418473720550537, "reward_std": 0.8325052857398987, "rewards/rollout_reward_func/mean": 0.36418473720550537, "rewards/rollout_reward_func/std": 0.8325052261352539, "sampling/importance_sampling_ratio/max": 1.2710076570510864, "sampling/importance_sampling_ratio/mean": 0.7977863550186157, "sampling/importance_sampling_ratio/min": 6.708206001349026e-06, "sampling/sampling_logp_difference/max": 2.1745197772979736, "sampling/sampling_logp_difference/mean": 0.2680322229862213, "step": 875, "step_time": 21.025865486008115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3363824151456356, "epoch": 0.01752, "grad_norm": 0.10545346140861511, "kl": 0.4551502652466297, "learning_rate": 7.998954730171898e-06, "loss": -0.02, "step": 876, "step_time": 11.163049127004342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.28125, "completions/mean_terminated_length": 4.566667079925537, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6325217187404633, "epoch": 0.01754, "frac_reward_zero_std": 0.125, "grad_norm": 0.15058879554271698, "kl": 0.3758406341075897, "learning_rate": 7.998952240097784e-06, "loss": -0.0256, "num_tokens": 22091861.0, "reward": 0.8172109127044678, "reward_std": 0.7503179907798767, "rewards/rollout_reward_func/mean": 0.8172109127044678, "rewards/rollout_reward_func/std": 0.7503179907798767, "sampling/importance_sampling_ratio/max": 1.3949910402297974, "sampling/importance_sampling_ratio/mean": 0.8886542320251465, "sampling/importance_sampling_ratio/min": 3.706883580889553e-05, "sampling/sampling_logp_difference/max": 1.8131027221679688, "sampling/sampling_logp_difference/mean": 0.2741304636001587, "step": 877, "step_time": 27.075730796990683 }, { "clip_ratio/high_max": 0.025252525694668293, "clip_ratio/high_mean": 0.012626262847334146, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012626262847334146, "entropy": 1.6498866081237793, "epoch": 0.01756, "grad_norm": 0.150237575173378, "kl": 0.33854543417692184, "learning_rate": 7.99894974706175e-06, "loss": -0.0264, "step": 878, "step_time": 13.898257427033968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.5, "completions/mean_terminated_length": 4.307692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8348776400089264, "epoch": 0.01758, "frac_reward_zero_std": 0.375, "grad_norm": 0.020762108266353607, "kl": 0.3872162811458111, "learning_rate": 7.998947251063797e-06, "loss": -0.0519, "num_tokens": 22139845.0, "reward": 0.6761025786399841, "reward_std": 0.9332374334335327, "rewards/rollout_reward_func/mean": 0.6761025786399841, "rewards/rollout_reward_func/std": 0.9332374334335327, "sampling/importance_sampling_ratio/max": 1.2973368167877197, "sampling/importance_sampling_ratio/mean": 0.7779029607772827, "sampling/importance_sampling_ratio/min": 4.457221791653865e-08, "sampling/sampling_logp_difference/max": 2.292520523071289, "sampling/sampling_logp_difference/mean": 0.3591827154159546, "step": 879, "step_time": 28.481937172007747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8441759049892426, "epoch": 0.0176, "grad_norm": 0.021416667848825455, "kl": 0.3670829087495804, "learning_rate": 7.99894475210393e-06, "loss": -0.0518, "step": 880, "step_time": 15.16786244197283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 9.40625, "completions/mean_terminated_length": 5.450000286102295, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.548595607280731, "epoch": 0.01762, "frac_reward_zero_std": 0.0, "grad_norm": 0.08058827370405197, "kl": 0.15114774741232395, "learning_rate": 7.998942250182149e-06, "loss": -0.0766, "num_tokens": 22205980.0, "reward": 0.1962215155363083, "reward_std": 0.8728079199790955, "rewards/rollout_reward_func/mean": 0.1962215155363083, "rewards/rollout_reward_func/std": 0.8728079199790955, "sampling/importance_sampling_ratio/max": 1.1878323554992676, "sampling/importance_sampling_ratio/mean": 0.45114243030548096, "sampling/importance_sampling_ratio/min": 1.793465344235301e-05, "sampling/sampling_logp_difference/max": 2.0994491577148438, "sampling/sampling_logp_difference/mean": 0.36064353585243225, "step": 881, "step_time": 32.236267406027764 }, { "clip_ratio/high_max": 0.014821272809058428, "clip_ratio/high_mean": 0.007410636404529214, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007410636404529214, "entropy": 2.555194139480591, "epoch": 0.01764, "grad_norm": 0.08879268914461136, "kl": 0.14520916901528835, "learning_rate": 7.998939745298458e-06, "loss": -0.0765, "step": 882, "step_time": 14.021643176005455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.224061757326126, "epoch": 0.01766, "frac_reward_zero_std": 0.125, "grad_norm": 0.035163503140211105, "kl": 0.5148035027086735, "learning_rate": 7.998937237452858e-06, "loss": -0.0677, "num_tokens": 22255465.0, "reward": 0.42554789781570435, "reward_std": 0.9483891129493713, "rewards/rollout_reward_func/mean": 0.42554789781570435, "rewards/rollout_reward_func/std": 0.9483891129493713, "sampling/importance_sampling_ratio/max": 1.1900877952575684, "sampling/importance_sampling_ratio/mean": 0.620192289352417, "sampling/importance_sampling_ratio/min": 3.0948694984545e-07, "sampling/sampling_logp_difference/max": 1.90084969997406, "sampling/sampling_logp_difference/mean": 0.36283010244369507, "step": 883, "step_time": 26.47223932700581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.22088485956192, "epoch": 0.01768, "grad_norm": 0.03527319058775902, "kl": 0.48265286162495613, "learning_rate": 7.998934726645356e-06, "loss": -0.0677, "step": 884, "step_time": 12.197209518024465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.615385055541992, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.119533360004425, "epoch": 0.0177, "frac_reward_zero_std": 0.0, "grad_norm": 0.11433341354131699, "kl": 0.3149260990321636, "learning_rate": 7.998932212875949e-06, "loss": -0.0813, "num_tokens": 22312994.0, "reward": 0.6597715616226196, "reward_std": 0.814160943031311, "rewards/rollout_reward_func/mean": 0.6597715616226196, "rewards/rollout_reward_func/std": 0.8141608834266663, "sampling/importance_sampling_ratio/max": 1.2151994705200195, "sampling/importance_sampling_ratio/mean": 0.6405022144317627, "sampling/importance_sampling_ratio/min": 1.3329974535736255e-05, "sampling/sampling_logp_difference/max": 2.039970636367798, "sampling/sampling_logp_difference/mean": 0.3401780128479004, "step": 885, "step_time": 25.93073724300484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.119465172290802, "epoch": 0.01772, "grad_norm": 0.10906080156564713, "kl": 0.32142362371087074, "learning_rate": 7.998929696144641e-06, "loss": -0.0816, "step": 886, "step_time": 12.80771491798805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.777777671813965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0192809998989105, "epoch": 0.01774, "frac_reward_zero_std": 0.125, "grad_norm": 0.07643667608499527, "kl": 0.2826278358697891, "learning_rate": 7.998927176451435e-06, "loss": -0.0222, "num_tokens": 22370812.0, "reward": 0.40456634759902954, "reward_std": 0.744246244430542, "rewards/rollout_reward_func/mean": 0.40456634759902954, "rewards/rollout_reward_func/std": 0.744246244430542, "sampling/importance_sampling_ratio/max": 1.5034823417663574, "sampling/importance_sampling_ratio/mean": 0.8051053285598755, "sampling/importance_sampling_ratio/min": 4.673554485634668e-07, "sampling/sampling_logp_difference/max": 2.195005416870117, "sampling/sampling_logp_difference/mean": 0.37348297238349915, "step": 887, "step_time": 28.88614701901679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.016717791557312, "epoch": 0.01776, "grad_norm": 0.07684139907360077, "kl": 0.2880295515060425, "learning_rate": 7.998924653796334e-06, "loss": -0.0222, "step": 888, "step_time": 14.845207662001485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.464409053325653, "epoch": 0.01778, "frac_reward_zero_std": 0.0, "grad_norm": 0.06323567032814026, "kl": 0.3598454222083092, "learning_rate": 7.99892212817934e-06, "loss": -0.0742, "num_tokens": 22421252.0, "reward": 0.6346477270126343, "reward_std": 0.8614195585250854, "rewards/rollout_reward_func/mean": 0.6346477270126343, "rewards/rollout_reward_func/std": 0.8614195585250854, "sampling/importance_sampling_ratio/max": 1.7125362157821655, "sampling/importance_sampling_ratio/mean": 0.8011000156402588, "sampling/importance_sampling_ratio/min": 0.0004159182426519692, "sampling/sampling_logp_difference/max": 1.9538036584854126, "sampling/sampling_logp_difference/mean": 0.2557521462440491, "step": 889, "step_time": 24.24710150100873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.466648519039154, "epoch": 0.0178, "grad_norm": 0.06515008956193924, "kl": 0.3647407703101635, "learning_rate": 7.998919599600457e-06, "loss": -0.0744, "step": 890, "step_time": 12.132935692963656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4978712350130081, "epoch": 0.01782, "frac_reward_zero_std": 0.125, "grad_norm": 0.1122710108757019, "kl": 0.2493177354335785, "learning_rate": 7.998917068059684e-06, "loss": -0.0685, "num_tokens": 22473602.0, "reward": 0.5171956419944763, "reward_std": 0.8599291443824768, "rewards/rollout_reward_func/mean": 0.5171956419944763, "rewards/rollout_reward_func/std": 0.8599291443824768, "sampling/importance_sampling_ratio/max": 1.311698079109192, "sampling/importance_sampling_ratio/mean": 0.8494666814804077, "sampling/importance_sampling_ratio/min": 0.0004069046990480274, "sampling/sampling_logp_difference/max": 2.232182502746582, "sampling/sampling_logp_difference/mean": 0.22371172904968262, "step": 891, "step_time": 25.65762302200892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4943458586931229, "epoch": 0.01784, "grad_norm": 0.11754997819662094, "kl": 0.24173838645219803, "learning_rate": 7.998914533557027e-06, "loss": -0.0692, "step": 892, "step_time": 12.307613621029304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 9.78125, "completions/mean_terminated_length": 6.523809432983398, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.9791187047958374, "epoch": 0.01786, "frac_reward_zero_std": 0.0, "grad_norm": 0.03651680797338486, "kl": 0.14293504133820534, "learning_rate": 7.998911996092486e-06, "loss": -0.0512, "num_tokens": 22539333.0, "reward": 0.2334323525428772, "reward_std": 0.8603338599205017, "rewards/rollout_reward_func/mean": 0.2334323525428772, "rewards/rollout_reward_func/std": 0.8603338003158569, "sampling/importance_sampling_ratio/max": 1.2302967309951782, "sampling/importance_sampling_ratio/mean": 0.4564949870109558, "sampling/importance_sampling_ratio/min": 1.1585252650547773e-06, "sampling/sampling_logp_difference/max": 2.0796852111816406, "sampling/sampling_logp_difference/mean": 0.42050158977508545, "step": 893, "step_time": 34.49381402099971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.978187322616577, "epoch": 0.01788, "grad_norm": 0.03408287838101387, "kl": 0.14864777028560638, "learning_rate": 7.998909455666065e-06, "loss": -0.0512, "step": 894, "step_time": 16.003362044983078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 5.133333683013916, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4172703810036182, "epoch": 0.0179, "frac_reward_zero_std": 0.375, "grad_norm": 0.1962641179561615, "kl": 0.5675617381930351, "learning_rate": 7.998906912277766e-06, "loss": -0.0391, "num_tokens": 22589692.0, "reward": 0.7889102697372437, "reward_std": 0.8993621468544006, "rewards/rollout_reward_func/mean": 0.7889102697372437, "rewards/rollout_reward_func/std": 0.8993620872497559, "sampling/importance_sampling_ratio/max": 1.4708822965621948, "sampling/importance_sampling_ratio/mean": 0.8398268222808838, "sampling/importance_sampling_ratio/min": 1.6508285625604913e-05, "sampling/sampling_logp_difference/max": 1.579804539680481, "sampling/sampling_logp_difference/mean": 0.2686467170715332, "step": 895, "step_time": 27.665509610960726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4306398704648018, "epoch": 0.01792, "grad_norm": 0.3411935269832611, "kl": 0.5253204181790352, "learning_rate": 7.998904365927591e-06, "loss": -0.0404, "step": 896, "step_time": 14.849372693977784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 5.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.4308528304100037, "epoch": 0.01794, "frac_reward_zero_std": 0.125, "grad_norm": 0.1213483214378357, "kl": 0.4100374672561884, "learning_rate": 7.998901816615544e-06, "loss": -0.0553, "num_tokens": 22651457.0, "reward": 0.3494962453842163, "reward_std": 0.9078482389450073, "rewards/rollout_reward_func/mean": 0.3494962453842163, "rewards/rollout_reward_func/std": 0.9078481793403625, "sampling/importance_sampling_ratio/max": 1.575693130493164, "sampling/importance_sampling_ratio/mean": 0.6755198836326599, "sampling/importance_sampling_ratio/min": 1.1563260159164201e-05, "sampling/sampling_logp_difference/max": 2.0975329875946045, "sampling/sampling_logp_difference/mean": 0.4049522876739502, "step": 897, "step_time": 29.64574786295998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0037878789007663727, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037878789007663727, "entropy": 2.4403499364852905, "epoch": 0.01796, "grad_norm": 0.11553837358951569, "kl": 0.4019184187054634, "learning_rate": 7.998899264341627e-06, "loss": -0.0555, "step": 898, "step_time": 13.93944437301252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.3214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4900591969490051, "epoch": 0.01798, "frac_reward_zero_std": 0.0, "grad_norm": 0.1718415468931198, "kl": 0.3656853288412094, "learning_rate": 7.99889670910584e-06, "loss": -0.057, "num_tokens": 22700598.0, "reward": 0.6456692218780518, "reward_std": 0.7368364334106445, "rewards/rollout_reward_func/mean": 0.6456692218780518, "rewards/rollout_reward_func/std": 0.7368363738059998, "sampling/importance_sampling_ratio/max": 1.362691879272461, "sampling/importance_sampling_ratio/mean": 0.7690780758857727, "sampling/importance_sampling_ratio/min": 0.0003925712371710688, "sampling/sampling_logp_difference/max": 2.091949462890625, "sampling/sampling_logp_difference/mean": 0.2307395339012146, "step": 899, "step_time": 25.805886141984956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4973083436489105, "epoch": 0.018, "grad_norm": 0.17671528458595276, "kl": 0.38898929581046104, "learning_rate": 7.99889415090819e-06, "loss": -0.0568, "step": 900, "step_time": 13.19880206201924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 6.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.289202779531479, "epoch": 0.01802, "frac_reward_zero_std": 0.0, "grad_norm": 0.10481826215982437, "kl": 0.34735599532723427, "learning_rate": 7.998891589748676e-06, "loss": -0.0906, "num_tokens": 22755037.0, "reward": 0.344631552696228, "reward_std": 0.872456431388855, "rewards/rollout_reward_func/mean": 0.344631552696228, "rewards/rollout_reward_func/std": 0.872456431388855, "sampling/importance_sampling_ratio/max": 1.477421760559082, "sampling/importance_sampling_ratio/mean": 0.6955140829086304, "sampling/importance_sampling_ratio/min": 8.351423552710457e-09, "sampling/sampling_logp_difference/max": 2.1664304733276367, "sampling/sampling_logp_difference/mean": 0.3922484815120697, "step": 901, "step_time": 25.52974820995587 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012442129664123058, "entropy": 2.280473977327347, "epoch": 0.01804, "grad_norm": 0.05054508522152901, "kl": 0.34448742121458054, "learning_rate": 7.998889025627303e-06, "loss": -0.091, "step": 902, "step_time": 13.074400372977834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.55845108628273, "epoch": 0.01806, "frac_reward_zero_std": 0.0, "grad_norm": 0.23507320880889893, "kl": 0.22755446285009384, "learning_rate": 7.99888645854407e-06, "loss": -0.0731, "num_tokens": 22812481.0, "reward": 0.4181972146034241, "reward_std": 0.7611493468284607, "rewards/rollout_reward_func/mean": 0.4181972146034241, "rewards/rollout_reward_func/std": 0.7611494064331055, "sampling/importance_sampling_ratio/max": 1.7337448596954346, "sampling/importance_sampling_ratio/mean": 0.7178665399551392, "sampling/importance_sampling_ratio/min": 1.0052473271571216e-06, "sampling/sampling_logp_difference/max": 1.9683938026428223, "sampling/sampling_logp_difference/mean": 0.38498497009277344, "step": 903, "step_time": 26.37314503901871 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 2.5283737778663635, "epoch": 0.01808, "grad_norm": 0.1901523768901825, "kl": 0.2501339688897133, "learning_rate": 7.998883888498983e-06, "loss": -0.0747, "step": 904, "step_time": 12.723803435015725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.407407283782959, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5251443684101105, "epoch": 0.0181, "frac_reward_zero_std": 0.0, "grad_norm": 0.06199756637215614, "kl": 0.221120472997427, "learning_rate": 7.998881315492043e-06, "loss": -0.0613, "num_tokens": 22863175.0, "reward": 0.882330060005188, "reward_std": 0.7837002277374268, "rewards/rollout_reward_func/mean": 0.882330060005188, "rewards/rollout_reward_func/std": 0.7837002277374268, "sampling/importance_sampling_ratio/max": 1.4126689434051514, "sampling/importance_sampling_ratio/mean": 0.8480911254882812, "sampling/importance_sampling_ratio/min": 4.9084770580520853e-05, "sampling/sampling_logp_difference/max": 1.9268009662628174, "sampling/sampling_logp_difference/mean": 0.2598744034767151, "step": 905, "step_time": 25.37784207399818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0035714285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035714285913854837, "entropy": 1.5067530274391174, "epoch": 0.01812, "grad_norm": 0.05159585550427437, "kl": 0.2341966349631548, "learning_rate": 7.998878739523254e-06, "loss": -0.0612, "step": 906, "step_time": 12.447395563998725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 5.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.577065736055374, "epoch": 0.01814, "frac_reward_zero_std": 0.125, "grad_norm": 0.13085702061653137, "kl": 0.208875123411417, "learning_rate": 7.998876160592615e-06, "loss": -0.0594, "num_tokens": 22913732.0, "reward": 0.3575819730758667, "reward_std": 0.8785123825073242, "rewards/rollout_reward_func/mean": 0.3575819730758667, "rewards/rollout_reward_func/std": 0.878512442111969, "sampling/importance_sampling_ratio/max": 1.5604240894317627, "sampling/importance_sampling_ratio/mean": 0.5322731733322144, "sampling/importance_sampling_ratio/min": 1.6841305239267967e-08, "sampling/sampling_logp_difference/max": 2.136876106262207, "sampling/sampling_logp_difference/mean": 0.44658195972442627, "step": 907, "step_time": 28.088072717975592 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.007305195089429617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015117695089429617, "entropy": 2.5573437809944153, "epoch": 0.01816, "grad_norm": 0.10031460970640182, "kl": 0.2180478647351265, "learning_rate": 7.998873578700133e-06, "loss": -0.0604, "step": 908, "step_time": 12.748710521962494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 4.34615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6712634861469269, "epoch": 0.01818, "frac_reward_zero_std": 0.25, "grad_norm": 0.07425491511821747, "kl": 0.3008932415395975, "learning_rate": 7.998870993845807e-06, "loss": -0.0631, "num_tokens": 22964820.0, "reward": 0.25957363843917847, "reward_std": 0.7500061988830566, "rewards/rollout_reward_func/mean": 0.25957363843917847, "rewards/rollout_reward_func/std": 0.7500061392784119, "sampling/importance_sampling_ratio/max": 1.2746566534042358, "sampling/importance_sampling_ratio/mean": 0.776003897190094, "sampling/importance_sampling_ratio/min": 2.2930009890842484e-06, "sampling/sampling_logp_difference/max": 1.8415597677230835, "sampling/sampling_logp_difference/mean": 0.25472554564476013, "step": 909, "step_time": 25.282755113003077 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062499860301614, "entropy": 1.6417014449834824, "epoch": 0.0182, "grad_norm": 0.06473524123430252, "kl": 0.34882691875100136, "learning_rate": 7.99886840602964e-06, "loss": -0.0635, "step": 910, "step_time": 12.7148666610301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.866666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2342073917388916, "epoch": 0.01822, "frac_reward_zero_std": 0.375, "grad_norm": 0.1233568787574768, "kl": 0.35814177989959717, "learning_rate": 7.998865815251639e-06, "loss": -0.0293, "num_tokens": 23009791.0, "reward": 0.47409749031066895, "reward_std": 0.882264256477356, "rewards/rollout_reward_func/mean": 0.47409749031066895, "rewards/rollout_reward_func/std": 0.8822641968727112, "sampling/importance_sampling_ratio/max": 1.637340784072876, "sampling/importance_sampling_ratio/mean": 0.867040753364563, "sampling/importance_sampling_ratio/min": 3.1139683414949104e-05, "sampling/sampling_logp_difference/max": 2.123037099838257, "sampling/sampling_logp_difference/mean": 0.2297658622264862, "step": 911, "step_time": 23.068620426987763 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.2212228327989578, "epoch": 0.01824, "grad_norm": 0.09512689709663391, "kl": 0.3921920135617256, "learning_rate": 7.998863221511802e-06, "loss": -0.0299, "step": 912, "step_time": 11.816693689004751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.548387050628662, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9685063119977713, "epoch": 0.01826, "frac_reward_zero_std": 0.375, "grad_norm": 0.11177025735378265, "kl": 0.45656726136803627, "learning_rate": 7.998860624810133e-06, "loss": -0.036, "num_tokens": 23056849.0, "reward": 0.8504214286804199, "reward_std": 0.7802199125289917, "rewards/rollout_reward_func/mean": 0.8504214286804199, "rewards/rollout_reward_func/std": 0.7802199125289917, "sampling/importance_sampling_ratio/max": 1.5436619520187378, "sampling/importance_sampling_ratio/mean": 0.9008122682571411, "sampling/importance_sampling_ratio/min": 2.3805494038242614e-06, "sampling/sampling_logp_difference/max": 1.997868537902832, "sampling/sampling_logp_difference/mean": 0.21847259998321533, "step": 913, "step_time": 22.815139319980517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9576263315975666, "epoch": 0.01828, "grad_norm": 0.09810028970241547, "kl": 0.45925547927618027, "learning_rate": 7.998858025146634e-06, "loss": -0.0363, "step": 914, "step_time": 12.265109932021005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.5, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.09393110871315, "epoch": 0.0183, "frac_reward_zero_std": 0.5, "grad_norm": 0.06303230673074722, "kl": 0.5108343474566936, "learning_rate": 7.998855422521307e-06, "loss": -0.0473, "num_tokens": 23103549.0, "reward": 0.530815839767456, "reward_std": 0.9264352917671204, "rewards/rollout_reward_func/mean": 0.530815839767456, "rewards/rollout_reward_func/std": 0.9264352917671204, "sampling/importance_sampling_ratio/max": 1.1962662935256958, "sampling/importance_sampling_ratio/mean": 0.8606502413749695, "sampling/importance_sampling_ratio/min": 1.0982749643062562e-07, "sampling/sampling_logp_difference/max": 2.7874131202697754, "sampling/sampling_logp_difference/mean": 0.23994295299053192, "step": 915, "step_time": 22.78201106900815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0912137888371944, "epoch": 0.01832, "grad_norm": 0.060894325375556946, "kl": 0.4975651428103447, "learning_rate": 7.998852816934154e-06, "loss": -0.0474, "step": 916, "step_time": 10.988366121979197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3054777104407549, "epoch": 0.01834, "frac_reward_zero_std": 0.25, "grad_norm": 0.16800495982170105, "kl": 0.2446563057601452, "learning_rate": 7.998850208385183e-06, "loss": 0.0022, "num_tokens": 23154021.0, "reward": 0.3567284047603607, "reward_std": 0.65027916431427, "rewards/rollout_reward_func/mean": 0.3567284047603607, "rewards/rollout_reward_func/std": 0.6502792239189148, "sampling/importance_sampling_ratio/max": 1.207997441291809, "sampling/importance_sampling_ratio/mean": 1.0168757438659668, "sampling/importance_sampling_ratio/min": 0.0036026921588927507, "sampling/sampling_logp_difference/max": 0.8015204668045044, "sampling/sampling_logp_difference/mean": 0.054353199899196625, "step": 917, "step_time": 25.065020876005292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3052009306848049, "epoch": 0.01836, "grad_norm": 0.16334283351898193, "kl": 0.24539417773485184, "learning_rate": 7.99884759687439e-06, "loss": 0.0015, "step": 918, "step_time": 14.246869701019023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.21880137361586094, "epoch": 0.01838, "frac_reward_zero_std": 0.75, "grad_norm": 0.0022224988788366318, "kl": 0.26414303109049797, "learning_rate": 7.998844982401781e-06, "loss": -0.0071, "num_tokens": 23201929.0, "reward": 0.9801011085510254, "reward_std": 0.5008237957954407, "rewards/rollout_reward_func/mean": 0.9801011085510254, "rewards/rollout_reward_func/std": 0.5008237957954407, "sampling/importance_sampling_ratio/max": 1.1327146291732788, "sampling/importance_sampling_ratio/mean": 1.023324966430664, "sampling/importance_sampling_ratio/min": 0.019592225551605225, "sampling/sampling_logp_difference/max": 0.7544440031051636, "sampling/sampling_logp_difference/mean": 0.03825557604432106, "step": 919, "step_time": 25.214485290984157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2221900187432766, "epoch": 0.0184, "grad_norm": 0.002269661519676447, "kl": 0.2640026733279228, "learning_rate": 7.998842364967358e-06, "loss": -0.0071, "step": 920, "step_time": 14.407491338002728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.03125, "completions/mean_terminated_length": 4.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2991204494610429, "epoch": 0.01842, "frac_reward_zero_std": 0.375, "grad_norm": 0.09197578579187393, "kl": 0.8017100393772125, "learning_rate": 7.998839744571121e-06, "loss": -0.0149, "num_tokens": 23254298.0, "reward": 1.0868446826934814, "reward_std": 0.5597075819969177, "rewards/rollout_reward_func/mean": 1.0868446826934814, "rewards/rollout_reward_func/std": 0.559707522392273, "sampling/importance_sampling_ratio/max": 1.5047636032104492, "sampling/importance_sampling_ratio/mean": 0.9856386780738831, "sampling/importance_sampling_ratio/min": 0.06719456613063812, "sampling/sampling_logp_difference/max": 1.5023751258850098, "sampling/sampling_logp_difference/mean": 0.07802776992321014, "step": 921, "step_time": 21.991757976007648 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 0.301967017352581, "epoch": 0.01844, "grad_norm": 0.09126047790050507, "kl": 0.7424473389983177, "learning_rate": 7.998837121213076e-06, "loss": -0.0154, "step": 922, "step_time": 12.11964545401861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6146706342697144, "epoch": 0.01846, "frac_reward_zero_std": 0.375, "grad_norm": 0.054592572152614594, "kl": 0.45973945036530495, "learning_rate": 7.998834494893225e-06, "loss": -0.054, "num_tokens": 23305873.0, "reward": 0.8688297271728516, "reward_std": 0.8184223175048828, "rewards/rollout_reward_func/mean": 0.8688297271728516, "rewards/rollout_reward_func/std": 0.8184223175048828, "sampling/importance_sampling_ratio/max": 1.166696548461914, "sampling/importance_sampling_ratio/mean": 0.8265382051467896, "sampling/importance_sampling_ratio/min": 3.763841959880665e-06, "sampling/sampling_logp_difference/max": 2.0390841960906982, "sampling/sampling_logp_difference/mean": 0.2969048321247101, "step": 923, "step_time": 22.964920623984654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6120158284902573, "epoch": 0.01848, "grad_norm": 0.03535613790154457, "kl": 0.4221835173666477, "learning_rate": 7.99883186561157e-06, "loss": -0.0541, "step": 924, "step_time": 11.869579574995441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0273659667000175, "epoch": 0.0185, "frac_reward_zero_std": 0.5, "grad_norm": 0.051629021763801575, "kl": 0.4163416847586632, "learning_rate": 7.998829233368114e-06, "loss": -0.0245, "num_tokens": 23352730.0, "reward": 0.21385931968688965, "reward_std": 0.7520698308944702, "rewards/rollout_reward_func/mean": 0.21385931968688965, "rewards/rollout_reward_func/std": 0.7520697712898254, "sampling/importance_sampling_ratio/max": 1.3485100269317627, "sampling/importance_sampling_ratio/mean": 0.8855186104774475, "sampling/importance_sampling_ratio/min": 2.3436412277533236e-07, "sampling/sampling_logp_difference/max": 2.335508108139038, "sampling/sampling_logp_difference/mean": 0.2237846553325653, "step": 925, "step_time": 25.179587441001786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.035132105462253, "epoch": 0.01852, "grad_norm": 0.052415549755096436, "kl": 0.39927004277706146, "learning_rate": 7.998826598162861e-06, "loss": -0.0247, "step": 926, "step_time": 13.662615972047206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 4.8125, "completions/mean_terminated_length": 4.451612949371338, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8512910827994347, "epoch": 0.01854, "frac_reward_zero_std": 0.125, "grad_norm": 0.10288552194833755, "kl": 1.2215748503804207, "learning_rate": 7.998823959995809e-06, "loss": -0.0284, "num_tokens": 23410442.0, "reward": 0.6086004972457886, "reward_std": 0.855540931224823, "rewards/rollout_reward_func/mean": 0.6086004972457886, "rewards/rollout_reward_func/std": 0.8555408716201782, "sampling/importance_sampling_ratio/max": 1.169433832168579, "sampling/importance_sampling_ratio/mean": 0.8856421113014221, "sampling/importance_sampling_ratio/min": 0.0002232017577625811, "sampling/sampling_logp_difference/max": 1.5112059116363525, "sampling/sampling_logp_difference/mean": 0.16690325736999512, "step": 927, "step_time": 26.78653878599289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8560119867324829, "epoch": 0.01856, "grad_norm": 0.09604748338460922, "kl": 1.0983827151358128, "learning_rate": 7.998821318866964e-06, "loss": -0.0286, "step": 928, "step_time": 13.813500702002784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.15625, "completions/mean_terminated_length": 4.433333396911621, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.018993191421032, "epoch": 0.01858, "frac_reward_zero_std": 0.375, "grad_norm": 0.06729699671268463, "kl": 0.4855296015739441, "learning_rate": 7.998818674776329e-06, "loss": -0.0515, "num_tokens": 23464379.0, "reward": 0.8729350566864014, "reward_std": 0.7367404699325562, "rewards/rollout_reward_func/mean": 0.8729350566864014, "rewards/rollout_reward_func/std": 0.7367405295372009, "sampling/importance_sampling_ratio/max": 1.1980994939804077, "sampling/importance_sampling_ratio/mean": 0.8545240163803101, "sampling/importance_sampling_ratio/min": 0.0001366191281704232, "sampling/sampling_logp_difference/max": 1.5143545866012573, "sampling/sampling_logp_difference/mean": 0.17389243841171265, "step": 929, "step_time": 27.640148395934375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.026109240949154, "epoch": 0.0186, "grad_norm": 0.05376217141747475, "kl": 0.424501396715641, "learning_rate": 7.998816027723906e-06, "loss": -0.0517, "step": 930, "step_time": 14.49546433397336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.59375, "completions/mean_terminated_length": 5.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2426210045814514, "epoch": 0.01862, "frac_reward_zero_std": 0.0, "grad_norm": 0.12423199415206909, "kl": 0.1879965141415596, "learning_rate": 7.998813377709697e-06, "loss": -0.0749, "num_tokens": 23524751.0, "reward": 0.210963636636734, "reward_std": 0.8505238890647888, "rewards/rollout_reward_func/mean": 0.210963636636734, "rewards/rollout_reward_func/std": 0.8505238890647888, "sampling/importance_sampling_ratio/max": 1.567151427268982, "sampling/importance_sampling_ratio/mean": 0.7196453213691711, "sampling/importance_sampling_ratio/min": 1.225739651999902e-06, "sampling/sampling_logp_difference/max": 2.037693738937378, "sampling/sampling_logp_difference/mean": 0.3328397274017334, "step": 931, "step_time": 27.05777559100534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.251105546951294, "epoch": 0.01864, "grad_norm": 0.11863663047552109, "kl": 0.18863008730113506, "learning_rate": 7.998810724733703e-06, "loss": -0.0751, "step": 932, "step_time": 12.958370337029919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3948394283652306, "epoch": 0.01866, "frac_reward_zero_std": 0.0, "grad_norm": 0.10714467614889145, "kl": 0.4832172468304634, "learning_rate": 7.99880806879593e-06, "loss": -0.0465, "num_tokens": 23579590.0, "reward": 0.5082764029502869, "reward_std": 0.7976247668266296, "rewards/rollout_reward_func/mean": 0.5082764029502869, "rewards/rollout_reward_func/std": 0.7976247072219849, "sampling/importance_sampling_ratio/max": 1.3776187896728516, "sampling/importance_sampling_ratio/mean": 0.8522911071777344, "sampling/importance_sampling_ratio/min": 1.0610446565806342e-07, "sampling/sampling_logp_difference/max": 2.2595937252044678, "sampling/sampling_logp_difference/mean": 0.2871597409248352, "step": 933, "step_time": 25.1386930779845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4166024029254913, "epoch": 0.01868, "grad_norm": 0.13363026082515717, "kl": 0.47601062059402466, "learning_rate": 7.998805409896381e-06, "loss": -0.0472, "step": 934, "step_time": 12.154004688025452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0408170744776726, "epoch": 0.0187, "frac_reward_zero_std": 0.25, "grad_norm": 0.10250511020421982, "kl": 0.23228824138641357, "learning_rate": 7.998802748035054e-06, "loss": -0.0192, "num_tokens": 23630088.0, "reward": 0.807683527469635, "reward_std": 0.7803139686584473, "rewards/rollout_reward_func/mean": 0.807683527469635, "rewards/rollout_reward_func/std": 0.7803139686584473, "sampling/importance_sampling_ratio/max": 1.4388501644134521, "sampling/importance_sampling_ratio/mean": 0.9402036070823669, "sampling/importance_sampling_ratio/min": 6.682916864519939e-05, "sampling/sampling_logp_difference/max": 1.4328241348266602, "sampling/sampling_logp_difference/mean": 0.16012871265411377, "step": 935, "step_time": 23.667907057970297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.068315289914608, "epoch": 0.01872, "grad_norm": 0.10744459182024002, "kl": 0.22645829990506172, "learning_rate": 7.998800083211955e-06, "loss": -0.0197, "step": 936, "step_time": 12.207521281030495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.600000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4398867301642895, "epoch": 0.01874, "frac_reward_zero_std": 0.25, "grad_norm": 0.08545362204313278, "kl": 0.3397102653980255, "learning_rate": 7.998797415427088e-06, "loss": -0.0502, "num_tokens": 23679767.0, "reward": 0.39029228687286377, "reward_std": 0.7760810852050781, "rewards/rollout_reward_func/mean": 0.39029228687286377, "rewards/rollout_reward_func/std": 0.7760810256004333, "sampling/importance_sampling_ratio/max": 1.1709293127059937, "sampling/importance_sampling_ratio/mean": 0.7929996252059937, "sampling/importance_sampling_ratio/min": 5.7613149692770094e-05, "sampling/sampling_logp_difference/max": 2.000636577606201, "sampling/sampling_logp_difference/mean": 0.23538519442081451, "step": 937, "step_time": 24.472036843013484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4633743986487389, "epoch": 0.01876, "grad_norm": 0.08558237552642822, "kl": 0.301720529794693, "learning_rate": 7.998794744680451e-06, "loss": -0.0503, "step": 938, "step_time": 12.87991373098339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 5.344827651977539, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6437419652938843, "epoch": 0.01878, "frac_reward_zero_std": 0.25, "grad_norm": 0.033792462199926376, "kl": 0.28924859687685966, "learning_rate": 7.99879207097205e-06, "loss": -0.0523, "num_tokens": 23729082.0, "reward": 0.8218448162078857, "reward_std": 0.8205535411834717, "rewards/rollout_reward_func/mean": 0.8218448162078857, "rewards/rollout_reward_func/std": 0.8205536007881165, "sampling/importance_sampling_ratio/max": 1.1786720752716064, "sampling/importance_sampling_ratio/mean": 0.8280671238899231, "sampling/importance_sampling_ratio/min": 3.0136332043184666e-06, "sampling/sampling_logp_difference/max": 1.9529175758361816, "sampling/sampling_logp_difference/mean": 0.29391562938690186, "step": 939, "step_time": 26.50812999700429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6586491018533707, "epoch": 0.0188, "grad_norm": 0.03291725367307663, "kl": 0.272922083735466, "learning_rate": 7.998789394301887e-06, "loss": -0.0524, "step": 940, "step_time": 14.150579873996321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.8125, "completions/mean_terminated_length": 5.519999980926514, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2569572627544403, "epoch": 0.01882, "frac_reward_zero_std": 0.125, "grad_norm": 0.10287985950708389, "kl": 0.15176107734441757, "learning_rate": 7.998786714669967e-06, "loss": -0.048, "num_tokens": 23788569.0, "reward": 0.3632175326347351, "reward_std": 0.7641940712928772, "rewards/rollout_reward_func/mean": 0.3632175326347351, "rewards/rollout_reward_func/std": 0.764194130897522, "sampling/importance_sampling_ratio/max": 1.5052218437194824, "sampling/importance_sampling_ratio/mean": 0.552538275718689, "sampling/importance_sampling_ratio/min": 1.5726167475804687e-05, "sampling/sampling_logp_difference/max": 2.3015103340148926, "sampling/sampling_logp_difference/mean": 0.3650004267692566, "step": 941, "step_time": 29.91416994202882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2806125581264496, "epoch": 0.01884, "grad_norm": 0.1037280261516571, "kl": 0.15374616347253323, "learning_rate": 7.998784032076289e-06, "loss": -0.0478, "step": 942, "step_time": 14.708937119983602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.3214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3494454324245453, "epoch": 0.01886, "frac_reward_zero_std": 0.375, "grad_norm": 0.0990288183093071, "kl": 0.5818279646337032, "learning_rate": 7.998781346520855e-06, "loss": -0.0452, "num_tokens": 23838497.0, "reward": 0.9670578241348267, "reward_std": 0.7726932168006897, "rewards/rollout_reward_func/mean": 0.9670578241348267, "rewards/rollout_reward_func/std": 0.7726932168006897, "sampling/importance_sampling_ratio/max": 1.4113507270812988, "sampling/importance_sampling_ratio/mean": 0.8731598854064941, "sampling/importance_sampling_ratio/min": 7.429606739606243e-06, "sampling/sampling_logp_difference/max": 1.926009178161621, "sampling/sampling_logp_difference/mean": 0.1935863345861435, "step": 943, "step_time": 26.53107710299082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3494539856910706, "epoch": 0.01888, "grad_norm": 0.09701445698738098, "kl": 0.5335769839584827, "learning_rate": 7.998778658003673e-06, "loss": -0.0457, "step": 944, "step_time": 13.0771083960135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.5625, "completions/mean_terminated_length": 5.199999809265137, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3627678751945496, "epoch": 0.0189, "frac_reward_zero_std": 0.0, "grad_norm": 0.13095833361148834, "kl": 0.24724240973591805, "learning_rate": 7.99877596652474e-06, "loss": -0.0915, "num_tokens": 23899010.0, "reward": 0.03835281729698181, "reward_std": 0.8647377490997314, "rewards/rollout_reward_func/mean": 0.03835281729698181, "rewards/rollout_reward_func/std": 0.8647376894950867, "sampling/importance_sampling_ratio/max": 1.2309879064559937, "sampling/importance_sampling_ratio/mean": 0.6096822023391724, "sampling/importance_sampling_ratio/min": 8.407341738347895e-06, "sampling/sampling_logp_difference/max": 1.709612488746643, "sampling/sampling_logp_difference/mean": 0.3574637472629547, "step": 945, "step_time": 31.538520577014424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3508493304252625, "epoch": 0.01892, "grad_norm": 0.12063201516866684, "kl": 0.2536473125219345, "learning_rate": 7.99877327208406e-06, "loss": -0.092, "step": 946, "step_time": 15.256323058973067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5910914838314056, "epoch": 0.01894, "frac_reward_zero_std": 0.125, "grad_norm": 0.055721599608659744, "kl": 0.27257465198636055, "learning_rate": 7.998770574681638e-06, "loss": -0.0486, "num_tokens": 23953755.0, "reward": 0.7140508890151978, "reward_std": 0.7871055603027344, "rewards/rollout_reward_func/mean": 0.7140508890151978, "rewards/rollout_reward_func/std": 0.7871055603027344, "sampling/importance_sampling_ratio/max": 1.373635172843933, "sampling/importance_sampling_ratio/mean": 0.808928370475769, "sampling/importance_sampling_ratio/min": 1.3951360244846e-07, "sampling/sampling_logp_difference/max": 2.1101531982421875, "sampling/sampling_logp_difference/mean": 0.26656103134155273, "step": 947, "step_time": 27.141418586019427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5751089453697205, "epoch": 0.01896, "grad_norm": 0.05137735232710838, "kl": 0.28768928349018097, "learning_rate": 7.998767874317474e-06, "loss": -0.0486, "step": 948, "step_time": 15.030326859035995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 5.34615421295166, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.194556385278702, "epoch": 0.01898, "frac_reward_zero_std": 0.25, "grad_norm": 0.08311603218317032, "kl": 0.29980818554759026, "learning_rate": 7.998765170991574e-06, "loss": -0.0403, "num_tokens": 24007162.0, "reward": 0.30999648571014404, "reward_std": 0.873345136642456, "rewards/rollout_reward_func/mean": 0.30999648571014404, "rewards/rollout_reward_func/std": 0.8733451962471008, "sampling/importance_sampling_ratio/max": 1.0708599090576172, "sampling/importance_sampling_ratio/mean": 0.5838338136672974, "sampling/importance_sampling_ratio/min": 3.63849289897189e-06, "sampling/sampling_logp_difference/max": 2.085449695587158, "sampling/sampling_logp_difference/mean": 0.3873281478881836, "step": 949, "step_time": 28.1164653020096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.186912566423416, "epoch": 0.019, "grad_norm": 0.08063726127147675, "kl": 0.2900832165032625, "learning_rate": 7.998762464703938e-06, "loss": -0.0406, "step": 950, "step_time": 14.490695283020614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.214285850524902, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5300891399383545, "epoch": 0.01902, "frac_reward_zero_std": 0.25, "grad_norm": 0.11394044756889343, "kl": 0.27436991035938263, "learning_rate": 7.998759755454567e-06, "loss": -0.0487, "num_tokens": 24062980.0, "reward": 0.41838449239730835, "reward_std": 0.7910724878311157, "rewards/rollout_reward_func/mean": 0.41838449239730835, "rewards/rollout_reward_func/std": 0.7910724878311157, "sampling/importance_sampling_ratio/max": 1.231461524963379, "sampling/importance_sampling_ratio/mean": 0.8431971073150635, "sampling/importance_sampling_ratio/min": 1.3371710849696683e-07, "sampling/sampling_logp_difference/max": 2.033970355987549, "sampling/sampling_logp_difference/mean": 0.30426305532455444, "step": 951, "step_time": 24.88836392504163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5298770070075989, "epoch": 0.01904, "grad_norm": 0.11516190320253372, "kl": 0.2829859182238579, "learning_rate": 7.998757043243469e-06, "loss": -0.049, "step": 952, "step_time": 12.943789299024502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.828689455986023, "epoch": 0.01906, "frac_reward_zero_std": 0.125, "grad_norm": 0.1033538281917572, "kl": 0.2631242796778679, "learning_rate": 7.99875432807064e-06, "loss": -0.0385, "num_tokens": 24110255.0, "reward": 0.5784745812416077, "reward_std": 0.9073578715324402, "rewards/rollout_reward_func/mean": 0.5784745812416077, "rewards/rollout_reward_func/std": 0.907357931137085, "sampling/importance_sampling_ratio/max": 1.3178486824035645, "sampling/importance_sampling_ratio/mean": 0.7417441606521606, "sampling/importance_sampling_ratio/min": 4.0880863139136636e-08, "sampling/sampling_logp_difference/max": 2.167773485183716, "sampling/sampling_logp_difference/mean": 0.3299628496170044, "step": 953, "step_time": 26.95730691900826 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.8327074944972992, "epoch": 0.01908, "grad_norm": 0.049933671951293945, "kl": 0.27148624509572983, "learning_rate": 7.99875160993609e-06, "loss": -0.0388, "step": 954, "step_time": 14.612852099031443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.043478488922119, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.075606733560562, "epoch": 0.0191, "frac_reward_zero_std": 0.125, "grad_norm": 0.12219936400651932, "kl": 0.28445854783058167, "learning_rate": 7.998748888839817e-06, "loss": -0.0647, "num_tokens": 24167238.0, "reward": 0.1527448445558548, "reward_std": 0.7757406234741211, "rewards/rollout_reward_func/mean": 0.1527448445558548, "rewards/rollout_reward_func/std": 0.7757406234741211, "sampling/importance_sampling_ratio/max": 1.3011504411697388, "sampling/importance_sampling_ratio/mean": 0.6985040903091431, "sampling/importance_sampling_ratio/min": 3.79163139996308e-07, "sampling/sampling_logp_difference/max": 1.9461803436279297, "sampling/sampling_logp_difference/mean": 0.3856941759586334, "step": 955, "step_time": 25.31985772997723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0843839943408966, "epoch": 0.01912, "grad_norm": 0.11361810564994812, "kl": 0.2780945710837841, "learning_rate": 7.998746164781823e-06, "loss": -0.0651, "step": 956, "step_time": 12.396545582014369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.375, "completions/mean_terminated_length": 5.384615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.50755974650383, "epoch": 0.01914, "frac_reward_zero_std": 0.0, "grad_norm": 0.12026838213205338, "kl": 0.24265097081661224, "learning_rate": 7.998743437762114e-06, "loss": -0.0731, "num_tokens": 24223805.0, "reward": 0.6965856552124023, "reward_std": 0.8171470761299133, "rewards/rollout_reward_func/mean": 0.6965856552124023, "rewards/rollout_reward_func/std": 0.8171470761299133, "sampling/importance_sampling_ratio/max": 1.233929991722107, "sampling/importance_sampling_ratio/mean": 0.6560267210006714, "sampling/importance_sampling_ratio/min": 6.088664150638579e-08, "sampling/sampling_logp_difference/max": 2.721024513244629, "sampling/sampling_logp_difference/mean": 0.4382055699825287, "step": 957, "step_time": 28.72980898496462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.499261438846588, "epoch": 0.01916, "grad_norm": 0.11600510776042938, "kl": 0.24874714016914368, "learning_rate": 7.99874070778069e-06, "loss": -0.0736, "step": 958, "step_time": 14.111411057994701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.625, "completions/mean_terminated_length": 4.761904716491699, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.32236909866333, "epoch": 0.01918, "frac_reward_zero_std": 0.375, "grad_norm": 0.05143306031823158, "kl": 0.24623797833919525, "learning_rate": 7.998737974837554e-06, "loss": -0.0466, "num_tokens": 24281821.0, "reward": 0.5776468515396118, "reward_std": 0.9829169511795044, "rewards/rollout_reward_func/mean": 0.5776468515396118, "rewards/rollout_reward_func/std": 0.9829169511795044, "sampling/importance_sampling_ratio/max": 1.1641623973846436, "sampling/importance_sampling_ratio/mean": 0.6359869241714478, "sampling/importance_sampling_ratio/min": 2.8996091714361683e-06, "sampling/sampling_logp_difference/max": 2.6193416118621826, "sampling/sampling_logp_difference/mean": 0.32587432861328125, "step": 959, "step_time": 35.12839938700199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.316939353942871, "epoch": 0.0192, "grad_norm": 0.0476246252655983, "kl": 0.23369098454713821, "learning_rate": 7.998735238932711e-06, "loss": -0.0469, "step": 960, "step_time": 16.60828847499215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0037878789007663727, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037878789007663727, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.25, "completions/mean_terminated_length": 5.629629611968994, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9024956226348877, "epoch": 0.01922, "frac_reward_zero_std": 0.125, "grad_norm": 0.19082623720169067, "kl": 0.2180456705391407, "learning_rate": 7.998732500066162e-06, "loss": -0.0585, "num_tokens": 24333243.0, "reward": 0.5085757970809937, "reward_std": 0.9572108387947083, "rewards/rollout_reward_func/mean": 0.5085757970809937, "rewards/rollout_reward_func/std": 0.9572108387947083, "sampling/importance_sampling_ratio/max": 1.2413115501403809, "sampling/importance_sampling_ratio/mean": 0.6149137020111084, "sampling/importance_sampling_ratio/min": 0.0001424611109541729, "sampling/sampling_logp_difference/max": 1.8849642276763916, "sampling/sampling_logp_difference/mean": 0.29724088311195374, "step": 961, "step_time": 26.544171332003316 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.8790870010852814, "epoch": 0.01924, "grad_norm": 0.11966454237699509, "kl": 0.22025081515312195, "learning_rate": 7.998729758237909e-06, "loss": -0.059, "step": 962, "step_time": 12.600843861990143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7527189403772354, "epoch": 0.01926, "frac_reward_zero_std": 0.25, "grad_norm": 0.07342271506786346, "kl": 0.2911675348877907, "learning_rate": 7.998727013447957e-06, "loss": -0.0616, "num_tokens": 24384708.0, "reward": 0.47072136402130127, "reward_std": 0.9082739353179932, "rewards/rollout_reward_func/mean": 0.47072136402130127, "rewards/rollout_reward_func/std": 0.9082739353179932, "sampling/importance_sampling_ratio/max": 1.3611652851104736, "sampling/importance_sampling_ratio/mean": 0.7754674553871155, "sampling/importance_sampling_ratio/min": 2.983930698974291e-06, "sampling/sampling_logp_difference/max": 1.6413826942443848, "sampling/sampling_logp_difference/mean": 0.29186320304870605, "step": 963, "step_time": 23.938612186990213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7307114452123642, "epoch": 0.01928, "grad_norm": 0.06461644917726517, "kl": 0.2998809814453125, "learning_rate": 7.998724265696306e-06, "loss": -0.062, "step": 964, "step_time": 11.261791712982813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.71875, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1073662005364895, "epoch": 0.0193, "frac_reward_zero_std": 0.25, "grad_norm": 0.1288452297449112, "kl": 0.2579329200088978, "learning_rate": 7.99872151498296e-06, "loss": -0.0488, "num_tokens": 24438859.0, "reward": 0.7587680220603943, "reward_std": 0.9354133009910583, "rewards/rollout_reward_func/mean": 0.7587680220603943, "rewards/rollout_reward_func/std": 0.9354133009910583, "sampling/importance_sampling_ratio/max": 1.2088243961334229, "sampling/importance_sampling_ratio/mean": 0.846251904964447, "sampling/importance_sampling_ratio/min": 0.0007557958015240729, "sampling/sampling_logp_difference/max": 1.207211971282959, "sampling/sampling_logp_difference/mean": 0.19263117015361786, "step": 965, "step_time": 24.184871563018532 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01315789483487606, "entropy": 1.0964238420128822, "epoch": 0.01932, "grad_norm": 0.11200374364852905, "kl": 0.2683631293475628, "learning_rate": 7.998718761307922e-06, "loss": -0.049, "step": 966, "step_time": 12.416690279962495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.8125, "completions/mean_terminated_length": 4.239999771118164, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7519115917384624, "epoch": 0.01934, "frac_reward_zero_std": 0.375, "grad_norm": 0.14278055727481842, "kl": 0.30338966473937035, "learning_rate": 7.998716004671194e-06, "loss": -0.0486, "num_tokens": 24492294.0, "reward": 0.9053899645805359, "reward_std": 0.7510256767272949, "rewards/rollout_reward_func/mean": 0.9053899645805359, "rewards/rollout_reward_func/std": 0.7510256767272949, "sampling/importance_sampling_ratio/max": 1.4379355907440186, "sampling/importance_sampling_ratio/mean": 0.7580493688583374, "sampling/importance_sampling_ratio/min": 7.176639087447256e-07, "sampling/sampling_logp_difference/max": 1.463080644607544, "sampling/sampling_logp_difference/mean": 0.29917049407958984, "step": 967, "step_time": 25.521165256999666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7498528808355331, "epoch": 0.01936, "grad_norm": 0.12790238857269287, "kl": 0.31295740976929665, "learning_rate": 7.998713245072779e-06, "loss": -0.0489, "step": 968, "step_time": 12.984030034014722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3706652224063873, "epoch": 0.01938, "frac_reward_zero_std": 0.25, "grad_norm": 0.04872964322566986, "kl": 0.4828987680375576, "learning_rate": 7.998710482512678e-06, "loss": -0.0711, "num_tokens": 24544711.0, "reward": 0.8133103847503662, "reward_std": 0.8293602466583252, "rewards/rollout_reward_func/mean": 0.8133103847503662, "rewards/rollout_reward_func/std": 0.82936030626297, "sampling/importance_sampling_ratio/max": 1.4980145692825317, "sampling/importance_sampling_ratio/mean": 0.8165569305419922, "sampling/importance_sampling_ratio/min": 0.00010813813423737884, "sampling/sampling_logp_difference/max": 2.284273147583008, "sampling/sampling_logp_difference/mean": 0.2520318031311035, "step": 969, "step_time": 25.27589831696241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.3715915083885193, "epoch": 0.0194, "grad_norm": 0.051509108394384384, "kl": 0.5123032256960869, "learning_rate": 7.9987077169909e-06, "loss": -0.0712, "step": 970, "step_time": 12.904539176990511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.46875, "completions/mean_terminated_length": 4.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3876252472400665, "epoch": 0.01942, "frac_reward_zero_std": 0.125, "grad_norm": 0.03980620950460434, "kl": 0.29536737874150276, "learning_rate": 7.99870494850744e-06, "loss": -0.0585, "num_tokens": 24607370.0, "reward": 0.45548784732818604, "reward_std": 0.8651227951049805, "rewards/rollout_reward_func/mean": 0.45548784732818604, "rewards/rollout_reward_func/std": 0.8651227951049805, "sampling/importance_sampling_ratio/max": 1.2426106929779053, "sampling/importance_sampling_ratio/mean": 0.6565449237823486, "sampling/importance_sampling_ratio/min": 7.920047551124298e-07, "sampling/sampling_logp_difference/max": 1.981052279472351, "sampling/sampling_logp_difference/mean": 0.3571053743362427, "step": 971, "step_time": 29.525872253987473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3808400630950928, "epoch": 0.01944, "grad_norm": 0.037879928946495056, "kl": 0.28920740634202957, "learning_rate": 7.998702177062306e-06, "loss": -0.0585, "step": 972, "step_time": 14.179862326040165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 4.5416669845581055, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8137705326080322, "epoch": 0.01946, "frac_reward_zero_std": 0.375, "grad_norm": 0.0357762910425663, "kl": 0.22511850856244564, "learning_rate": 7.998699402655498e-06, "loss": -0.0485, "num_tokens": 24662157.0, "reward": 0.6557356715202332, "reward_std": 0.9487630724906921, "rewards/rollout_reward_func/mean": 0.6557356715202332, "rewards/rollout_reward_func/std": 0.9487630128860474, "sampling/importance_sampling_ratio/max": 1.3672206401824951, "sampling/importance_sampling_ratio/mean": 0.7141883969306946, "sampling/importance_sampling_ratio/min": 3.6577287119143875e-06, "sampling/sampling_logp_difference/max": 1.9580864906311035, "sampling/sampling_logp_difference/mean": 0.28191882371902466, "step": 973, "step_time": 28.01835255100741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.81072336435318, "epoch": 0.01948, "grad_norm": 0.034904323518276215, "kl": 0.2198411114513874, "learning_rate": 7.998696625287019e-06, "loss": -0.0486, "step": 974, "step_time": 12.881772370019462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8839172422885895, "epoch": 0.0195, "frac_reward_zero_std": 0.125, "grad_norm": 0.1667032688856125, "kl": 1.1265920475125313, "learning_rate": 7.998693844956872e-06, "loss": -0.0738, "num_tokens": 24721075.0, "reward": 0.7534092664718628, "reward_std": 0.9021803736686707, "rewards/rollout_reward_func/mean": 0.7534092664718628, "rewards/rollout_reward_func/std": 0.9021803736686707, "sampling/importance_sampling_ratio/max": 1.1249741315841675, "sampling/importance_sampling_ratio/mean": 0.7035284042358398, "sampling/importance_sampling_ratio/min": 3.311263571958989e-05, "sampling/sampling_logp_difference/max": 2.2211406230926514, "sampling/sampling_logp_difference/mean": 0.325751930475235, "step": 975, "step_time": 26.54945286497241 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 1.878909945487976, "epoch": 0.01952, "grad_norm": 0.1197228878736496, "kl": 0.9156474620103836, "learning_rate": 7.998691061665061e-06, "loss": -0.0746, "step": 976, "step_time": 12.747173151001334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 6.53125, "completions/mean_terminated_length": 3.879999876022339, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.900278240442276, "epoch": 0.01954, "frac_reward_zero_std": 0.125, "grad_norm": 0.05045076832175255, "kl": 0.6203304976224899, "learning_rate": 7.998688275411588e-06, "loss": -0.0549, "num_tokens": 24779826.0, "reward": 0.6531127691268921, "reward_std": 0.8756875395774841, "rewards/rollout_reward_func/mean": 0.6531127691268921, "rewards/rollout_reward_func/std": 0.8756875395774841, "sampling/importance_sampling_ratio/max": 1.2718796730041504, "sampling/importance_sampling_ratio/mean": 0.7457532286643982, "sampling/importance_sampling_ratio/min": 7.488870323868468e-08, "sampling/sampling_logp_difference/max": 1.915161371231079, "sampling/sampling_logp_difference/mean": 0.37612152099609375, "step": 977, "step_time": 29.25851570200757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9059450030326843, "epoch": 0.01956, "grad_norm": 0.04860633239150047, "kl": 0.6097137629985809, "learning_rate": 7.998685486196455e-06, "loss": -0.0551, "step": 978, "step_time": 14.87817639001878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3786408323794603, "epoch": 0.01958, "frac_reward_zero_std": 0.5, "grad_norm": 0.011437340639531612, "kl": 0.2642633058130741, "learning_rate": 7.998682694019663e-06, "loss": -0.0501, "num_tokens": 24823491.0, "reward": 1.0014493465423584, "reward_std": 0.7392088770866394, "rewards/rollout_reward_func/mean": 1.0014493465423584, "rewards/rollout_reward_func/std": 0.7392088174819946, "sampling/importance_sampling_ratio/max": 1.0674424171447754, "sampling/importance_sampling_ratio/mean": 0.8504869937896729, "sampling/importance_sampling_ratio/min": 1.077329784493486e-06, "sampling/sampling_logp_difference/max": 2.2870869636535645, "sampling/sampling_logp_difference/mean": 0.23622554540634155, "step": 979, "step_time": 21.903188723983476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3806256353855133, "epoch": 0.0196, "grad_norm": 0.011844012886285782, "kl": 0.2691408656537533, "learning_rate": 7.99867989888122e-06, "loss": -0.0501, "step": 980, "step_time": 11.623307611007476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.870967388153076, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0885002985596657, "epoch": 0.01962, "frac_reward_zero_std": 0.25, "grad_norm": 0.09165627509355545, "kl": 0.6001616343855858, "learning_rate": 7.998677100781126e-06, "loss": -0.0365, "num_tokens": 24878420.0, "reward": 0.7795955538749695, "reward_std": 0.8101626038551331, "rewards/rollout_reward_func/mean": 0.7795955538749695, "rewards/rollout_reward_func/std": 0.8101626038551331, "sampling/importance_sampling_ratio/max": 1.1918314695358276, "sampling/importance_sampling_ratio/mean": 0.8468402624130249, "sampling/importance_sampling_ratio/min": 0.0001006928359856829, "sampling/sampling_logp_difference/max": 1.6812843084335327, "sampling/sampling_logp_difference/mean": 0.20795372128486633, "step": 981, "step_time": 22.98417667299509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1044384725391865, "epoch": 0.01964, "grad_norm": 0.09690061956644058, "kl": 0.5695957988500595, "learning_rate": 7.998674299719382e-06, "loss": -0.0367, "step": 982, "step_time": 12.172312849987065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.0, "completions/mean_terminated_length": 4.5714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1923428773880005, "epoch": 0.01966, "frac_reward_zero_std": 0.25, "grad_norm": 0.017458593472838402, "kl": 0.2972240075469017, "learning_rate": 7.998671495695992e-06, "loss": -0.0553, "num_tokens": 24930468.0, "reward": 0.3241874873638153, "reward_std": 0.9231465458869934, "rewards/rollout_reward_func/mean": 0.3241874873638153, "rewards/rollout_reward_func/std": 0.9231464862823486, "sampling/importance_sampling_ratio/max": 1.1942120790481567, "sampling/importance_sampling_ratio/mean": 0.8545041084289551, "sampling/importance_sampling_ratio/min": 1.9631894247140735e-05, "sampling/sampling_logp_difference/max": 1.6282036304473877, "sampling/sampling_logp_difference/mean": 0.2107027769088745, "step": 983, "step_time": 28.353066106006736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.200646050274372, "epoch": 0.01968, "grad_norm": 0.018152087926864624, "kl": 0.30161283165216446, "learning_rate": 7.99866868871096e-06, "loss": -0.0553, "step": 984, "step_time": 14.15487968098023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 5.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1671313643455505, "epoch": 0.0197, "frac_reward_zero_std": 0.125, "grad_norm": 0.5739511847496033, "kl": 0.22124380618333817, "learning_rate": 7.998665878764288e-06, "loss": -0.0351, "num_tokens": 24981375.0, "reward": 0.3099816143512726, "reward_std": 0.8437848687171936, "rewards/rollout_reward_func/mean": 0.3099816143512726, "rewards/rollout_reward_func/std": 0.8437848687171936, "sampling/importance_sampling_ratio/max": 1.2101753950119019, "sampling/importance_sampling_ratio/mean": 0.6453642249107361, "sampling/importance_sampling_ratio/min": 6.251971740312001e-08, "sampling/sampling_logp_difference/max": 2.196345329284668, "sampling/sampling_logp_difference/mean": 0.3651145100593567, "step": 985, "step_time": 24.73377143696416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03330592066049576, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03330592066049576, "entropy": 2.2143504917621613, "epoch": 0.01972, "grad_norm": 0.25260791182518005, "kl": 0.30061110854148865, "learning_rate": 7.998663065855979e-06, "loss": -0.0371, "step": 986, "step_time": 11.542457809991902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.758620738983154, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6346799284219742, "epoch": 0.01974, "frac_reward_zero_std": 0.25, "grad_norm": 0.10223368555307388, "kl": 0.2272159904241562, "learning_rate": 7.998660249986034e-06, "loss": -0.0416, "num_tokens": 25033497.0, "reward": 0.6088603734970093, "reward_std": 0.8541015982627869, "rewards/rollout_reward_func/mean": 0.6088603734970093, "rewards/rollout_reward_func/std": 0.8541015386581421, "sampling/importance_sampling_ratio/max": 1.3605625629425049, "sampling/importance_sampling_ratio/mean": 0.7805777192115784, "sampling/importance_sampling_ratio/min": 8.692126698406355e-07, "sampling/sampling_logp_difference/max": 2.430068254470825, "sampling/sampling_logp_difference/mean": 0.31895262002944946, "step": 987, "step_time": 24.44483972902526 }, { "clip_ratio/high_max": 0.018928571604192257, "clip_ratio/high_mean": 0.009464285802096128, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009464285802096128, "entropy": 1.6301162093877792, "epoch": 0.01976, "grad_norm": 0.09911707043647766, "kl": 0.22385342046618462, "learning_rate": 7.998657431154457e-06, "loss": -0.0423, "step": 988, "step_time": 12.08531808303087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.480000019073486, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.06561878323555, "epoch": 0.01978, "frac_reward_zero_std": 0.125, "grad_norm": 0.02965831384062767, "kl": 0.3933737836778164, "learning_rate": 7.998654609361253e-06, "loss": -0.065, "num_tokens": 25087558.0, "reward": 0.678896427154541, "reward_std": 0.9161184430122375, "rewards/rollout_reward_func/mean": 0.678896427154541, "rewards/rollout_reward_func/std": 0.9161184430122375, "sampling/importance_sampling_ratio/max": 1.1917110681533813, "sampling/importance_sampling_ratio/mean": 0.7134039402008057, "sampling/importance_sampling_ratio/min": 2.5840840578439384e-08, "sampling/sampling_logp_difference/max": 2.6702075004577637, "sampling/sampling_logp_difference/mean": 0.34127187728881836, "step": 989, "step_time": 26.15549459599424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.06745707988739, "epoch": 0.0198, "grad_norm": 0.03097226656973362, "kl": 0.4023287482559681, "learning_rate": 7.99865178460642e-06, "loss": -0.0651, "step": 990, "step_time": 12.330453357979422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 5.642857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6972489953041077, "epoch": 0.01982, "frac_reward_zero_std": 0.25, "grad_norm": 0.5238308310508728, "kl": 0.2237449586391449, "learning_rate": 7.998648956889966e-06, "loss": -0.0522, "num_tokens": 25140433.0, "reward": 0.5849642157554626, "reward_std": 0.9041756987571716, "rewards/rollout_reward_func/mean": 0.5849642157554626, "rewards/rollout_reward_func/std": 0.9041756987571716, "sampling/importance_sampling_ratio/max": 1.836624264717102, "sampling/importance_sampling_ratio/mean": 0.7733287811279297, "sampling/importance_sampling_ratio/min": 9.22499384614639e-05, "sampling/sampling_logp_difference/max": 1.6380019187927246, "sampling/sampling_logp_difference/mean": 0.23660731315612793, "step": 991, "step_time": 29.682278141990537 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006801470648497343, "entropy": 1.696266084909439, "epoch": 0.01984, "grad_norm": 0.10968637466430664, "kl": 0.22561510279774666, "learning_rate": 7.998646126211891e-06, "loss": -0.0548, "step": 992, "step_time": 14.07746898199548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0400757044553757, "epoch": 0.01986, "frac_reward_zero_std": 0.25, "grad_norm": 0.06490226835012436, "kl": 0.4942829832434654, "learning_rate": 7.998643292572196e-06, "loss": -0.0534, "num_tokens": 25194734.0, "reward": 0.7215322852134705, "reward_std": 0.7790694236755371, "rewards/rollout_reward_func/mean": 0.7215322852134705, "rewards/rollout_reward_func/std": 0.7790694236755371, "sampling/importance_sampling_ratio/max": 1.3402360677719116, "sampling/importance_sampling_ratio/mean": 0.9273396730422974, "sampling/importance_sampling_ratio/min": 1.0230120096821338e-05, "sampling/sampling_logp_difference/max": 2.0429649353027344, "sampling/sampling_logp_difference/mean": 0.23739299178123474, "step": 993, "step_time": 23.48892256198451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0386626422405243, "epoch": 0.01988, "grad_norm": 0.06458072364330292, "kl": 0.49960826337337494, "learning_rate": 7.998640455970888e-06, "loss": -0.0535, "step": 994, "step_time": 12.943837157014059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.125, "completions/mean_terminated_length": 4.545454502105713, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.505031406879425, "epoch": 0.0199, "frac_reward_zero_std": 0.125, "grad_norm": 0.21158663928508759, "kl": 0.15911671705543995, "learning_rate": 7.998637616407968e-06, "loss": -0.0553, "num_tokens": 25250344.0, "reward": 0.32934391498565674, "reward_std": 0.8935753703117371, "rewards/rollout_reward_func/mean": 0.32934391498565674, "rewards/rollout_reward_func/std": 0.8935753107070923, "sampling/importance_sampling_ratio/max": 1.2080881595611572, "sampling/importance_sampling_ratio/mean": 0.6400542259216309, "sampling/importance_sampling_ratio/min": 1.2829748818887765e-08, "sampling/sampling_logp_difference/max": 2.356867790222168, "sampling/sampling_logp_difference/mean": 0.44739198684692383, "step": 995, "step_time": 30.195484125986695 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.5125443041324615, "epoch": 0.01992, "grad_norm": 0.2552334666252136, "kl": 0.1541472189128399, "learning_rate": 7.998634773883437e-06, "loss": -0.0569, "step": 996, "step_time": 12.88032817299245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.673232764005661, "epoch": 0.01994, "frac_reward_zero_std": 0.25, "grad_norm": 0.1605309247970581, "kl": 0.47864845395088196, "learning_rate": 7.9986319283973e-06, "loss": -0.0352, "num_tokens": 25305323.0, "reward": 0.8227241039276123, "reward_std": 0.6634626388549805, "rewards/rollout_reward_func/mean": 0.8227241039276123, "rewards/rollout_reward_func/std": 0.6634626388549805, "sampling/importance_sampling_ratio/max": 1.1964242458343506, "sampling/importance_sampling_ratio/mean": 0.8340375423431396, "sampling/importance_sampling_ratio/min": 1.6923475243402208e-07, "sampling/sampling_logp_difference/max": 2.3009324073791504, "sampling/sampling_logp_difference/mean": 0.32441121339797974, "step": 997, "step_time": 23.744231863005552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6895909905433655, "epoch": 0.01996, "grad_norm": 0.1572604775428772, "kl": 0.48209813982248306, "learning_rate": 7.99862907994956e-06, "loss": -0.0357, "step": 998, "step_time": 12.83544237705064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.40625, "completions/mean_terminated_length": 4.310344696044922, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.554191142320633, "epoch": 0.01998, "frac_reward_zero_std": 0.5, "grad_norm": 0.12172026932239532, "kl": 1.1693013794720173, "learning_rate": 7.998626228540219e-06, "loss": -0.0281, "num_tokens": 25349151.0, "reward": 0.4648004174232483, "reward_std": 0.837614119052887, "rewards/rollout_reward_func/mean": 0.4648004174232483, "rewards/rollout_reward_func/std": 0.837614119052887, "sampling/importance_sampling_ratio/max": 1.1718621253967285, "sampling/importance_sampling_ratio/mean": 0.7855414152145386, "sampling/importance_sampling_ratio/min": 2.771025037873187e-06, "sampling/sampling_logp_difference/max": 1.8733519315719604, "sampling/sampling_logp_difference/mean": 0.25008365511894226, "step": 999, "step_time": 24.343418720032787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.570234775543213, "epoch": 0.02, "grad_norm": 0.09646601974964142, "kl": 1.071443434804678, "learning_rate": 7.998623374169279e-06, "loss": -0.0285, "step": 1000, "step_time": 12.85610230799648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.344827651977539, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6018833257257938, "epoch": 0.02002, "frac_reward_zero_std": 0.25, "grad_norm": 0.12632295489311218, "kl": 0.3759583607316017, "learning_rate": 7.998620516836743e-06, "loss": -0.0494, "num_tokens": 25392604.0, "reward": 0.7409913539886475, "reward_std": 0.8576152920722961, "rewards/rollout_reward_func/mean": 0.7409913539886475, "rewards/rollout_reward_func/std": 0.8576152920722961, "sampling/importance_sampling_ratio/max": 1.3451781272888184, "sampling/importance_sampling_ratio/mean": 0.8751301765441895, "sampling/importance_sampling_ratio/min": 9.435897396770088e-09, "sampling/sampling_logp_difference/max": 2.2226178646087646, "sampling/sampling_logp_difference/mean": 0.3025854229927063, "step": 1001, "step_time": 21.976253063010518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6108513846993446, "epoch": 0.02004, "grad_norm": 0.11439148336648941, "kl": 0.37727784365415573, "learning_rate": 7.998617656542615e-06, "loss": -0.0497, "step": 1002, "step_time": 11.952094657957787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.46875, "completions/mean_terminated_length": 5.107142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8560167253017426, "epoch": 0.02006, "frac_reward_zero_std": 0.125, "grad_norm": 0.09883952140808105, "kl": 0.4791669733822346, "learning_rate": 7.998614793286897e-06, "loss": -0.0556, "num_tokens": 25448118.0, "reward": 0.7323142290115356, "reward_std": 0.9036542177200317, "rewards/rollout_reward_func/mean": 0.7323142290115356, "rewards/rollout_reward_func/std": 0.903654158115387, "sampling/importance_sampling_ratio/max": 1.2151914834976196, "sampling/importance_sampling_ratio/mean": 0.6697738766670227, "sampling/importance_sampling_ratio/min": 9.520499588688836e-06, "sampling/sampling_logp_difference/max": 2.678187131881714, "sampling/sampling_logp_difference/mean": 0.3272981345653534, "step": 1003, "step_time": 29.210743147035828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8593790903687477, "epoch": 0.02008, "grad_norm": 0.09729461371898651, "kl": 0.44565274752676487, "learning_rate": 7.998611927069592e-06, "loss": -0.0558, "step": 1004, "step_time": 14.136504680966027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.15625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7209931015968323, "epoch": 0.0201, "frac_reward_zero_std": 0.375, "grad_norm": 0.07509568333625793, "kl": 0.21669670566916466, "learning_rate": 7.998609057890704e-06, "loss": -0.0351, "num_tokens": 25497629.0, "reward": 0.6817891001701355, "reward_std": 0.8569533824920654, "rewards/rollout_reward_func/mean": 0.6817891001701355, "rewards/rollout_reward_func/std": 0.8569533824920654, "sampling/importance_sampling_ratio/max": 1.2214316129684448, "sampling/importance_sampling_ratio/mean": 0.8126102685928345, "sampling/importance_sampling_ratio/min": 5.289598266244866e-05, "sampling/sampling_logp_difference/max": 1.697870135307312, "sampling/sampling_logp_difference/mean": 0.25317397713661194, "step": 1005, "step_time": 24.94055528898025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7123540937900543, "epoch": 0.02012, "grad_norm": 0.07384396344423294, "kl": 0.21443554759025574, "learning_rate": 7.998606185750234e-06, "loss": -0.0352, "step": 1006, "step_time": 12.355434911005432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.0625, "completions/mean_terminated_length": 4.333333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1299760416150093, "epoch": 0.02014, "frac_reward_zero_std": 0.375, "grad_norm": 0.050187524408102036, "kl": 0.42526815831661224, "learning_rate": 7.998603310648185e-06, "loss": -0.038, "num_tokens": 25543340.0, "reward": 0.9865965843200684, "reward_std": 0.774745762348175, "rewards/rollout_reward_func/mean": 0.9865965843200684, "rewards/rollout_reward_func/std": 0.7747458219528198, "sampling/importance_sampling_ratio/max": 1.1839443445205688, "sampling/importance_sampling_ratio/mean": 0.8421745300292969, "sampling/importance_sampling_ratio/min": 4.821355105377734e-06, "sampling/sampling_logp_difference/max": 2.056637763977051, "sampling/sampling_logp_difference/mean": 0.22924581170082092, "step": 1007, "step_time": 25.241983899992192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.127401776611805, "epoch": 0.02016, "grad_norm": 0.04815196990966797, "kl": 0.4312765896320343, "learning_rate": 7.998600432584561e-06, "loss": -0.0381, "step": 1008, "step_time": 12.655202224006644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005434782709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005434782709926367, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.785714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8922263085842133, "epoch": 0.02018, "frac_reward_zero_std": 0.125, "grad_norm": 0.11883921921253204, "kl": 0.2918071933090687, "learning_rate": 7.998597551559365e-06, "loss": -0.0537, "num_tokens": 25598862.0, "reward": 0.5267260074615479, "reward_std": 0.8482297658920288, "rewards/rollout_reward_func/mean": 0.5267260074615479, "rewards/rollout_reward_func/std": 0.8482297658920288, "sampling/importance_sampling_ratio/max": 1.2883011102676392, "sampling/importance_sampling_ratio/mean": 0.7156966924667358, "sampling/importance_sampling_ratio/min": 1.878908939545454e-08, "sampling/sampling_logp_difference/max": 2.0430588722229004, "sampling/sampling_logp_difference/mean": 0.3632986843585968, "step": 1009, "step_time": 30.007371434010565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8725111782550812, "epoch": 0.0202, "grad_norm": 0.10546396672725677, "kl": 0.2897592671215534, "learning_rate": 7.998594667572597e-06, "loss": -0.0543, "step": 1010, "step_time": 15.239562102011405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.71875, "completions/mean_terminated_length": 4.4782609939575195, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.782999575138092, "epoch": 0.02022, "frac_reward_zero_std": 0.0, "grad_norm": 0.05008091405034065, "kl": 0.2669501565396786, "learning_rate": 7.998591780624263e-06, "loss": -0.0917, "num_tokens": 25661899.0, "reward": 0.30110886693000793, "reward_std": 0.8942062258720398, "rewards/rollout_reward_func/mean": 0.30110886693000793, "rewards/rollout_reward_func/std": 0.8942062854766846, "sampling/importance_sampling_ratio/max": 1.2984074354171753, "sampling/importance_sampling_ratio/mean": 0.6308752298355103, "sampling/importance_sampling_ratio/min": 2.980701196975133e-07, "sampling/sampling_logp_difference/max": 2.1318604946136475, "sampling/sampling_logp_difference/mean": 0.474065899848938, "step": 1011, "step_time": 34.20920359104639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.7664361596107483, "epoch": 0.02024, "grad_norm": 0.046384770423173904, "kl": 0.27387160807847977, "learning_rate": 7.998588890714367e-06, "loss": -0.0919, "step": 1012, "step_time": 14.804636870016111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9883537292480469, "epoch": 0.02026, "frac_reward_zero_std": 0.375, "grad_norm": 0.09262709319591522, "kl": 0.18091058358550072, "learning_rate": 7.998585997842907e-06, "loss": -0.0586, "num_tokens": 25720812.0, "reward": 0.6944676637649536, "reward_std": 0.9400820732116699, "rewards/rollout_reward_func/mean": 0.6944676637649536, "rewards/rollout_reward_func/std": 0.9400820136070251, "sampling/importance_sampling_ratio/max": 1.8788191080093384, "sampling/importance_sampling_ratio/mean": 0.7682949304580688, "sampling/importance_sampling_ratio/min": 5.705047545490061e-09, "sampling/sampling_logp_difference/max": 2.0639114379882812, "sampling/sampling_logp_difference/mean": 0.3141902685165405, "step": 1013, "step_time": 32.486778523016255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.98308464884758, "epoch": 0.02028, "grad_norm": 0.09518013894557953, "kl": 0.1852332465350628, "learning_rate": 7.998583102009889e-06, "loss": -0.059, "step": 1014, "step_time": 15.643134810961783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.230597097426653, "epoch": 0.0203, "frac_reward_zero_std": 0.5, "grad_norm": 0.03378254175186157, "kl": 0.2687136307358742, "learning_rate": 7.998580203215317e-06, "loss": -0.0385, "num_tokens": 25769374.0, "reward": 0.9084150195121765, "reward_std": 0.9028205871582031, "rewards/rollout_reward_func/mean": 0.9084150195121765, "rewards/rollout_reward_func/std": 0.9028206467628479, "sampling/importance_sampling_ratio/max": 1.1371492147445679, "sampling/importance_sampling_ratio/mean": 0.8374693393707275, "sampling/importance_sampling_ratio/min": 0.00031543264049105346, "sampling/sampling_logp_difference/max": 2.5785789489746094, "sampling/sampling_logp_difference/mean": 0.20416447520256042, "step": 1015, "step_time": 21.101963413035264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2219487875699997, "epoch": 0.02032, "grad_norm": 0.03368889540433884, "kl": 0.26544713228940964, "learning_rate": 7.99857730145919e-06, "loss": -0.0385, "step": 1016, "step_time": 10.757474391983123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.2528753876686096, "epoch": 0.02034, "frac_reward_zero_std": 0.0, "grad_norm": 0.0667426735162735, "kl": 0.5470127835869789, "learning_rate": 7.998574396741514e-06, "loss": -0.0844, "num_tokens": 25828600.0, "reward": 0.448589026927948, "reward_std": 0.8893107771873474, "rewards/rollout_reward_func/mean": 0.448589026927948, "rewards/rollout_reward_func/std": 0.8893107175827026, "sampling/importance_sampling_ratio/max": 1.1911733150482178, "sampling/importance_sampling_ratio/mean": 0.651749849319458, "sampling/importance_sampling_ratio/min": 4.0947629713627975e-06, "sampling/sampling_logp_difference/max": 2.083441734313965, "sampling/sampling_logp_difference/mean": 0.3741360306739807, "step": 1017, "step_time": 29.072070079972036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 2.2507379353046417, "epoch": 0.02036, "grad_norm": 0.0849124938249588, "kl": 0.5943334847688675, "learning_rate": 7.998571489062291e-06, "loss": -0.0845, "step": 1018, "step_time": 14.827478080027504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.59375, "completions/mean_terminated_length": 4.900000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.403414037078619, "epoch": 0.02038, "frac_reward_zero_std": 0.375, "grad_norm": 0.07702852785587311, "kl": 0.4356771409511566, "learning_rate": 7.998568578421523e-06, "loss": -0.0479, "num_tokens": 25880843.0, "reward": 0.6293737888336182, "reward_std": 0.8391358256340027, "rewards/rollout_reward_func/mean": 0.6293737888336182, "rewards/rollout_reward_func/std": 0.8391358256340027, "sampling/importance_sampling_ratio/max": 1.263372540473938, "sampling/importance_sampling_ratio/mean": 0.9005444049835205, "sampling/importance_sampling_ratio/min": 6.948728398237236e-11, "sampling/sampling_logp_difference/max": 2.446502208709717, "sampling/sampling_logp_difference/mean": 0.28263601660728455, "step": 1019, "step_time": 24.63241228001425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4038067031651735, "epoch": 0.0204, "grad_norm": 0.07620200514793396, "kl": 0.4727563038468361, "learning_rate": 7.998565664819215e-06, "loss": -0.0479, "step": 1020, "step_time": 12.925605071039172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.833333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5949964821338654, "epoch": 0.02042, "frac_reward_zero_std": 0.125, "grad_norm": 0.32143789529800415, "kl": 2.799627847969532, "learning_rate": 7.998562748255368e-06, "loss": -0.0475, "num_tokens": 25934660.0, "reward": 0.5785032510757446, "reward_std": 0.7318413853645325, "rewards/rollout_reward_func/mean": 0.5785032510757446, "rewards/rollout_reward_func/std": 0.7318413853645325, "sampling/importance_sampling_ratio/max": 1.2140288352966309, "sampling/importance_sampling_ratio/mean": 0.7530128359794617, "sampling/importance_sampling_ratio/min": 1.6133192559664167e-07, "sampling/sampling_logp_difference/max": 2.064375400543213, "sampling/sampling_logp_difference/mean": 0.3091781735420227, "step": 1021, "step_time": 24.447090176021447 }, { "clip_ratio/high_max": 0.02452153153717518, "clip_ratio/high_mean": 0.01226076576858759, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016725051682442427, "entropy": 1.6103624552488327, "epoch": 0.02044, "grad_norm": 0.23851683735847473, "kl": 2.0887812077999115, "learning_rate": 7.998559828729986e-06, "loss": -0.0505, "step": 1022, "step_time": 13.276284277962986 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.0650634467601776, "epoch": 0.02046, "frac_reward_zero_std": 0.25, "grad_norm": 0.254467636346817, "kl": 0.9284754414111376, "learning_rate": 7.99855690624307e-06, "loss": -0.047, "num_tokens": 25991287.0, "reward": 0.23655441403388977, "reward_std": 0.7814993262290955, "rewards/rollout_reward_func/mean": 0.23655441403388977, "rewards/rollout_reward_func/std": 0.7814993858337402, "sampling/importance_sampling_ratio/max": 1.4221546649932861, "sampling/importance_sampling_ratio/mean": 0.7108238935470581, "sampling/importance_sampling_ratio/min": 3.4585561934363795e-06, "sampling/sampling_logp_difference/max": 1.7489559650421143, "sampling/sampling_logp_difference/mean": 0.3006112277507782, "step": 1023, "step_time": 27.831689255981473 }, { "clip_ratio/high_max": 0.027529762126505375, "clip_ratio/high_mean": 0.013764881063252687, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013764881063252687, "entropy": 2.095166891813278, "epoch": 0.02048, "grad_norm": 0.12437517195940018, "kl": 0.43450064957141876, "learning_rate": 7.998553980794626e-06, "loss": -0.0494, "step": 1024, "step_time": 12.728351409983588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.1875, "completions/mean_terminated_length": 4.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5259836316108704, "epoch": 0.0205, "frac_reward_zero_std": 0.125, "grad_norm": 0.028204962611198425, "kl": 0.35901160910725594, "learning_rate": 7.998551052384655e-06, "loss": -0.056, "num_tokens": 26036437.0, "reward": 0.8671233654022217, "reward_std": 0.8225075602531433, "rewards/rollout_reward_func/mean": 0.8671233654022217, "rewards/rollout_reward_func/std": 0.8225075006484985, "sampling/importance_sampling_ratio/max": 1.1354631185531616, "sampling/importance_sampling_ratio/mean": 0.8325263857841492, "sampling/importance_sampling_ratio/min": 2.598418120669521e-07, "sampling/sampling_logp_difference/max": 1.719447374343872, "sampling/sampling_logp_difference/mean": 0.3002728819847107, "step": 1025, "step_time": 20.326370297989342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5447906255722046, "epoch": 0.02052, "grad_norm": 0.03379298374056816, "kl": 0.336632814258337, "learning_rate": 7.998548121013159e-06, "loss": -0.0558, "step": 1026, "step_time": 10.527363987028366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 7.15625, "completions/mean_terminated_length": 4.208333492279053, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2831755578517914, "epoch": 0.02054, "frac_reward_zero_std": 0.0, "grad_norm": 0.07757648080587387, "kl": 0.2905869875103235, "learning_rate": 7.998545186680142e-06, "loss": -0.0656, "num_tokens": 26097930.0, "reward": 0.24388840794563293, "reward_std": 0.8178853988647461, "rewards/rollout_reward_func/mean": 0.24388840794563293, "rewards/rollout_reward_func/std": 0.8178853988647461, "sampling/importance_sampling_ratio/max": 1.4231064319610596, "sampling/importance_sampling_ratio/mean": 0.7280651330947876, "sampling/importance_sampling_ratio/min": 3.444939409291692e-07, "sampling/sampling_logp_difference/max": 2.621164560317993, "sampling/sampling_logp_difference/mean": 0.37442511320114136, "step": 1027, "step_time": 30.081557189987507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.295133590698242, "epoch": 0.02056, "grad_norm": 0.08259354531764984, "kl": 0.27790845558047295, "learning_rate": 7.998542249385607e-06, "loss": -0.0653, "step": 1028, "step_time": 15.166755881975405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.34375, "completions/mean_terminated_length": 4.964285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.1933720111846924, "epoch": 0.02058, "frac_reward_zero_std": 0.125, "grad_norm": 0.06600581109523773, "kl": 0.3749687075614929, "learning_rate": 7.998539309129558e-06, "loss": -0.0868, "num_tokens": 26143846.0, "reward": 0.7705837488174438, "reward_std": 0.8519506454467773, "rewards/rollout_reward_func/mean": 0.7705837488174438, "rewards/rollout_reward_func/std": 0.8519505858421326, "sampling/importance_sampling_ratio/max": 1.5818357467651367, "sampling/importance_sampling_ratio/mean": 0.7658353447914124, "sampling/importance_sampling_ratio/min": 2.560840641763207e-07, "sampling/sampling_logp_difference/max": 2.139366388320923, "sampling/sampling_logp_difference/mean": 0.42238977551460266, "step": 1029, "step_time": 24.256476220005425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1956806778907776, "epoch": 0.0206, "grad_norm": 0.06869316846132278, "kl": 0.3673676624894142, "learning_rate": 7.998536365911996e-06, "loss": -0.0866, "step": 1030, "step_time": 12.262781547993654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.90625, "completions/mean_terminated_length": 4.807692527770996, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.557185113430023, "epoch": 0.02062, "frac_reward_zero_std": 0.0, "grad_norm": 0.04698825255036354, "kl": 0.3848050683736801, "learning_rate": 7.998533419732923e-06, "loss": -0.0693, "num_tokens": 26200936.0, "reward": 0.8984103798866272, "reward_std": 0.785402774810791, "rewards/rollout_reward_func/mean": 0.8984103798866272, "rewards/rollout_reward_func/std": 0.7854027152061462, "sampling/importance_sampling_ratio/max": 1.3804913759231567, "sampling/importance_sampling_ratio/mean": 0.6887458562850952, "sampling/importance_sampling_ratio/min": 8.865574940841725e-09, "sampling/sampling_logp_difference/max": 2.412773609161377, "sampling/sampling_logp_difference/mean": 0.4068428874015808, "step": 1031, "step_time": 28.62530293900636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0059523810632526875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "entropy": 2.5561195015907288, "epoch": 0.02064, "grad_norm": 0.0452975332736969, "kl": 0.40083223581314087, "learning_rate": 7.998530470592344e-06, "loss": -0.0693, "step": 1032, "step_time": 14.981719581002835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 5.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7722079455852509, "epoch": 0.02066, "frac_reward_zero_std": 0.25, "grad_norm": 0.048670828342437744, "kl": 0.35317613929510117, "learning_rate": 7.998527518490262e-06, "loss": -0.0405, "num_tokens": 26257675.0, "reward": 0.8374674916267395, "reward_std": 0.7573046684265137, "rewards/rollout_reward_func/mean": 0.8374674916267395, "rewards/rollout_reward_func/std": 0.7573046088218689, "sampling/importance_sampling_ratio/max": 1.3791346549987793, "sampling/importance_sampling_ratio/mean": 0.7197098731994629, "sampling/importance_sampling_ratio/min": 1.205138801196881e-06, "sampling/sampling_logp_difference/max": 1.9278013706207275, "sampling/sampling_logp_difference/mean": 0.29910773038864136, "step": 1033, "step_time": 26.05749459497747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.766334056854248, "epoch": 0.02068, "grad_norm": 0.053760040551424026, "kl": 0.35142140090465546, "learning_rate": 7.99852456342668e-06, "loss": -0.0404, "step": 1034, "step_time": 13.176110739033902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.8275861740112305, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.680861011147499, "epoch": 0.0207, "frac_reward_zero_std": 0.125, "grad_norm": 0.048179324716329575, "kl": 0.3361874967813492, "learning_rate": 7.998521605401599e-06, "loss": -0.0444, "num_tokens": 26309230.0, "reward": 0.6766096353530884, "reward_std": 0.8067501187324524, "rewards/rollout_reward_func/mean": 0.6766096353530884, "rewards/rollout_reward_func/std": 0.8067500591278076, "sampling/importance_sampling_ratio/max": 1.207259178161621, "sampling/importance_sampling_ratio/mean": 0.7643610835075378, "sampling/importance_sampling_ratio/min": 6.825577656854875e-06, "sampling/sampling_logp_difference/max": 1.9366165399551392, "sampling/sampling_logp_difference/mean": 0.2670416831970215, "step": 1035, "step_time": 23.238599698990583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6682343035936356, "epoch": 0.02072, "grad_norm": 0.04214305430650711, "kl": 0.3237243555486202, "learning_rate": 7.998518644415023e-06, "loss": -0.0446, "step": 1036, "step_time": 12.27746747501078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.40625, "completions/mean_terminated_length": 5.42307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3377639949321747, "epoch": 0.02074, "frac_reward_zero_std": 0.25, "grad_norm": 0.0429641492664814, "kl": 0.2435160130262375, "learning_rate": 7.998515680466955e-06, "loss": -0.0717, "num_tokens": 26366769.0, "reward": 0.5797520875930786, "reward_std": 0.9058220386505127, "rewards/rollout_reward_func/mean": 0.5797520875930786, "rewards/rollout_reward_func/std": 0.9058220386505127, "sampling/importance_sampling_ratio/max": 1.2888435125350952, "sampling/importance_sampling_ratio/mean": 0.6331292986869812, "sampling/importance_sampling_ratio/min": 4.657359227167035e-07, "sampling/sampling_logp_difference/max": 2.2894198894500732, "sampling/sampling_logp_difference/mean": 0.368182897567749, "step": 1037, "step_time": 28.102284025982954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.330658793449402, "epoch": 0.02076, "grad_norm": 0.04057968407869339, "kl": 0.25706950202584267, "learning_rate": 7.9985127135574e-06, "loss": -0.0718, "step": 1038, "step_time": 13.281655060040066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.962963104248047, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9255053102970123, "epoch": 0.02078, "frac_reward_zero_std": 0.0, "grad_norm": 0.17005905508995056, "kl": 0.3410212956368923, "learning_rate": 7.998509743686357e-06, "loss": -0.0716, "num_tokens": 26419598.0, "reward": 0.7016487121582031, "reward_std": 0.8742042779922485, "rewards/rollout_reward_func/mean": 0.7016487121582031, "rewards/rollout_reward_func/std": 0.8742042779922485, "sampling/importance_sampling_ratio/max": 1.2201465368270874, "sampling/importance_sampling_ratio/mean": 0.7023848295211792, "sampling/importance_sampling_ratio/min": 7.082794581947383e-07, "sampling/sampling_logp_difference/max": 2.0512073040008545, "sampling/sampling_logp_difference/mean": 0.3243796229362488, "step": 1039, "step_time": 25.835532109980704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9171003103256226, "epoch": 0.0208, "grad_norm": 0.1702578067779541, "kl": 0.35851186141371727, "learning_rate": 7.99850677085383e-06, "loss": -0.072, "step": 1040, "step_time": 13.501620844006538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.84375, "completions/mean_terminated_length": 4.392857551574707, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5673214495182037, "epoch": 0.02082, "frac_reward_zero_std": 0.125, "grad_norm": 0.07931418716907501, "kl": 0.22473983839154243, "learning_rate": 7.998503795059823e-06, "loss": -0.0385, "num_tokens": 26468120.0, "reward": 0.14483650028705597, "reward_std": 0.8475173711776733, "rewards/rollout_reward_func/mean": 0.14483650028705597, "rewards/rollout_reward_func/std": 0.8475173115730286, "sampling/importance_sampling_ratio/max": 1.082991361618042, "sampling/importance_sampling_ratio/mean": 0.8245063424110413, "sampling/importance_sampling_ratio/min": 4.1032805597751576e-08, "sampling/sampling_logp_difference/max": 2.0567855834960938, "sampling/sampling_logp_difference/mean": 0.25284701585769653, "step": 1041, "step_time": 23.680381147976732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.5694420039653778, "epoch": 0.02084, "grad_norm": 0.07714101672172546, "kl": 0.22970475628972054, "learning_rate": 7.998500816304342e-06, "loss": -0.0387, "step": 1042, "step_time": 11.978923237969866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 5.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.9948716759681702, "epoch": 0.02086, "frac_reward_zero_std": 0.0, "grad_norm": 0.05176990479230881, "kl": 0.5126195475459099, "learning_rate": 7.998497834587384e-06, "loss": -0.0421, "num_tokens": 26522301.0, "reward": 0.7050657272338867, "reward_std": 0.8356267213821411, "rewards/rollout_reward_func/mean": 0.7050657272338867, "rewards/rollout_reward_func/std": 0.8356266617774963, "sampling/importance_sampling_ratio/max": 1.1918514966964722, "sampling/importance_sampling_ratio/mean": 0.771774172782898, "sampling/importance_sampling_ratio/min": 3.8830245330245816e-07, "sampling/sampling_logp_difference/max": 2.04154896736145, "sampling/sampling_logp_difference/mean": 0.361605167388916, "step": 1043, "step_time": 26.80650498298928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9923137426376343, "epoch": 0.02088, "grad_norm": 0.057574789971113205, "kl": 0.5350692197680473, "learning_rate": 7.998494849908953e-06, "loss": -0.042, "step": 1044, "step_time": 13.774894617992686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.5364491939544678, "epoch": 0.0209, "frac_reward_zero_std": 0.25, "grad_norm": 0.03713465854525566, "kl": 0.34011052176356316, "learning_rate": 7.998491862269058e-06, "loss": -0.0692, "num_tokens": 26568500.0, "reward": 0.6409585475921631, "reward_std": 0.8300343155860901, "rewards/rollout_reward_func/mean": 0.6409585475921631, "rewards/rollout_reward_func/std": 0.8300343155860901, "sampling/importance_sampling_ratio/max": 1.1992627382278442, "sampling/importance_sampling_ratio/mean": 0.7664770483970642, "sampling/importance_sampling_ratio/min": 1.2454500764036425e-09, "sampling/sampling_logp_difference/max": 2.828670024871826, "sampling/sampling_logp_difference/mean": 0.48457419872283936, "step": 1045, "step_time": 23.92938039402361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5381645560264587, "epoch": 0.02092, "grad_norm": 0.0369880236685276, "kl": 0.3222961537539959, "learning_rate": 7.998488871667693e-06, "loss": -0.0693, "step": 1046, "step_time": 12.833126888988772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2507259100675583, "epoch": 0.02094, "frac_reward_zero_std": 0.25, "grad_norm": 0.06270001083612442, "kl": 0.5937852300703526, "learning_rate": 7.998485878104867e-06, "loss": -0.0342, "num_tokens": 26619892.0, "reward": 0.6117247343063354, "reward_std": 0.7615000009536743, "rewards/rollout_reward_func/mean": 0.6117247343063354, "rewards/rollout_reward_func/std": 0.7614999413490295, "sampling/importance_sampling_ratio/max": 1.144168734550476, "sampling/importance_sampling_ratio/mean": 0.8421649932861328, "sampling/importance_sampling_ratio/min": 4.996917414246127e-05, "sampling/sampling_logp_difference/max": 2.0706071853637695, "sampling/sampling_logp_difference/mean": 0.22396838665008545, "step": 1047, "step_time": 27.27035935301683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2523027881979942, "epoch": 0.02096, "grad_norm": 0.061970099806785583, "kl": 0.6035730242729187, "learning_rate": 7.998482881580582e-06, "loss": -0.0343, "step": 1048, "step_time": 15.109758466976928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.28125, "completions/mean_terminated_length": 4.481481552124023, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8727101683616638, "epoch": 0.02098, "frac_reward_zero_std": 0.125, "grad_norm": 0.06927274167537689, "kl": 0.28190189972519875, "learning_rate": 7.99847988209484e-06, "loss": -0.054, "num_tokens": 26668808.0, "reward": 0.4373977482318878, "reward_std": 0.8511765003204346, "rewards/rollout_reward_func/mean": 0.4373977482318878, "rewards/rollout_reward_func/std": 0.8511765003204346, "sampling/importance_sampling_ratio/max": 1.2722855806350708, "sampling/importance_sampling_ratio/mean": 0.7160035371780396, "sampling/importance_sampling_ratio/min": 3.3571686230970954e-07, "sampling/sampling_logp_difference/max": 1.8528797626495361, "sampling/sampling_logp_difference/mean": 0.3124728202819824, "step": 1049, "step_time": 22.87429269205313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8657435774803162, "epoch": 0.021, "grad_norm": 0.06819741427898407, "kl": 0.29032488353550434, "learning_rate": 7.998476879647645e-06, "loss": -0.0542, "step": 1050, "step_time": 10.92166253196774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.748972326517105, "epoch": 0.02102, "frac_reward_zero_std": 0.25, "grad_norm": 0.0949273407459259, "kl": 0.2307615429162979, "learning_rate": 7.998473874239e-06, "loss": -0.0699, "num_tokens": 26722297.0, "reward": 0.727798581123352, "reward_std": 0.8989815711975098, "rewards/rollout_reward_func/mean": 0.727798581123352, "rewards/rollout_reward_func/std": 0.8989814519882202, "sampling/importance_sampling_ratio/max": 1.321833848953247, "sampling/importance_sampling_ratio/mean": 0.7791686058044434, "sampling/importance_sampling_ratio/min": 2.782651972665917e-05, "sampling/sampling_logp_difference/max": 1.9408209323883057, "sampling/sampling_logp_difference/mean": 0.27771469950675964, "step": 1051, "step_time": 25.997310878970893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.7399112582206726, "epoch": 0.02104, "grad_norm": 0.09185885637998581, "kl": 0.22915450856089592, "learning_rate": 7.998470865868904e-06, "loss": -0.0701, "step": 1052, "step_time": 13.402965930028586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 9.6875, "completions/mean_terminated_length": 5.3684210777282715, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 3.120747148990631, "epoch": 0.02106, "frac_reward_zero_std": 0.0, "grad_norm": 0.08211395889520645, "kl": 0.2450927011668682, "learning_rate": 7.998467854537366e-06, "loss": -0.0539, "num_tokens": 26782300.0, "reward": 0.06554882973432541, "reward_std": 0.9044044017791748, "rewards/rollout_reward_func/mean": 0.06554882973432541, "rewards/rollout_reward_func/std": 0.9044044613838196, "sampling/importance_sampling_ratio/max": 1.3481863737106323, "sampling/importance_sampling_ratio/mean": 0.42476022243499756, "sampling/importance_sampling_ratio/min": 1.0088473345604143e-07, "sampling/sampling_logp_difference/max": 2.1980695724487305, "sampling/sampling_logp_difference/mean": 0.42930543422698975, "step": 1053, "step_time": 30.367436608969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.1131559014320374, "epoch": 0.02108, "grad_norm": 0.08023114502429962, "kl": 0.2540014125406742, "learning_rate": 7.998464840244384e-06, "loss": -0.0542, "step": 1054, "step_time": 13.290586729039205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 6.6875, "completions/mean_terminated_length": 4.538461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.147150084376335, "epoch": 0.0211, "frac_reward_zero_std": 0.0, "grad_norm": 0.11376174539327621, "kl": 0.2837623655796051, "learning_rate": 7.998461822989966e-06, "loss": -0.0629, "num_tokens": 26838086.0, "reward": 0.22656558454036713, "reward_std": 0.8079739809036255, "rewards/rollout_reward_func/mean": 0.22656558454036713, "rewards/rollout_reward_func/std": 0.8079739212989807, "sampling/importance_sampling_ratio/max": 1.2081842422485352, "sampling/importance_sampling_ratio/mean": 0.7552902698516846, "sampling/importance_sampling_ratio/min": 5.468453423418396e-07, "sampling/sampling_logp_difference/max": 2.03702712059021, "sampling/sampling_logp_difference/mean": 0.3383040726184845, "step": 1055, "step_time": 27.910055505053606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.1349093466997147, "epoch": 0.02112, "grad_norm": 0.11338750272989273, "kl": 0.29488058760762215, "learning_rate": 7.998458802774108e-06, "loss": -0.0634, "step": 1056, "step_time": 13.728110307012685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 6.4375, "completions/mean_terminated_length": 5.0714287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5533003360033035, "epoch": 0.02114, "frac_reward_zero_std": 0.125, "grad_norm": 0.050408776849508286, "kl": 0.3143523409962654, "learning_rate": 7.998455779596822e-06, "loss": -0.057, "num_tokens": 26900636.0, "reward": 0.6945868134498596, "reward_std": 0.8952638506889343, "rewards/rollout_reward_func/mean": 0.6945868134498596, "rewards/rollout_reward_func/std": 0.8952638506889343, "sampling/importance_sampling_ratio/max": 1.11403226852417, "sampling/importance_sampling_ratio/mean": 0.7639156579971313, "sampling/importance_sampling_ratio/min": 2.12801910492999e-06, "sampling/sampling_logp_difference/max": 1.9118437767028809, "sampling/sampling_logp_difference/mean": 0.3264917731285095, "step": 1057, "step_time": 27.90723491300014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5424674060195684, "epoch": 0.02116, "grad_norm": 0.04196055978536606, "kl": 0.31715821847319603, "learning_rate": 7.998452753458104e-06, "loss": -0.0572, "step": 1058, "step_time": 14.265135062014451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.53125, "completions/mean_terminated_length": 4.448276042938232, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3303243517875671, "epoch": 0.02118, "frac_reward_zero_std": 0.375, "grad_norm": 0.07080011814832687, "kl": 0.3390701524913311, "learning_rate": 7.998449724357959e-06, "loss": -0.0511, "num_tokens": 26947257.0, "reward": 0.4408890902996063, "reward_std": 0.8597273826599121, "rewards/rollout_reward_func/mean": 0.4408890902996063, "rewards/rollout_reward_func/std": 0.8597273230552673, "sampling/importance_sampling_ratio/max": 1.2493295669555664, "sampling/importance_sampling_ratio/mean": 0.853346049785614, "sampling/importance_sampling_ratio/min": 1.434311161574442e-05, "sampling/sampling_logp_difference/max": 2.0035653114318848, "sampling/sampling_logp_difference/mean": 0.21404127776622772, "step": 1059, "step_time": 24.019285420974484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3267102241516113, "epoch": 0.0212, "grad_norm": 0.08221380412578583, "kl": 0.34634919464588165, "learning_rate": 7.99844669229639e-06, "loss": -0.0509, "step": 1060, "step_time": 12.31325552702765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6896188259124756, "epoch": 0.02122, "frac_reward_zero_std": 0.125, "grad_norm": 0.08633940666913986, "kl": 0.5082866512238979, "learning_rate": 7.9984436572734e-06, "loss": -0.0456, "num_tokens": 27007203.0, "reward": 0.39084920287132263, "reward_std": 0.7476038932800293, "rewards/rollout_reward_func/mean": 0.39084920287132263, "rewards/rollout_reward_func/std": 0.7476037740707397, "sampling/importance_sampling_ratio/max": 1.148559331893921, "sampling/importance_sampling_ratio/mean": 0.75404953956604, "sampling/importance_sampling_ratio/min": 2.247970520841136e-08, "sampling/sampling_logp_difference/max": 2.57235050201416, "sampling/sampling_logp_difference/mean": 0.3150480389595032, "step": 1061, "step_time": 25.546733527007746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.683763787150383, "epoch": 0.02124, "grad_norm": 0.07890327274799347, "kl": 0.5235726460814476, "learning_rate": 7.998440619288992e-06, "loss": -0.0457, "step": 1062, "step_time": 13.577434709965019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6235468536615372, "epoch": 0.02126, "frac_reward_zero_std": 0.125, "grad_norm": 0.03281911462545395, "kl": 0.3277752622961998, "learning_rate": 7.99843757834317e-06, "loss": -0.0165, "num_tokens": 27075523.0, "reward": 0.5838991403579712, "reward_std": 0.6907845139503479, "rewards/rollout_reward_func/mean": 0.5838991403579712, "rewards/rollout_reward_func/std": 0.6907845139503479, "sampling/importance_sampling_ratio/max": 1.2721259593963623, "sampling/importance_sampling_ratio/mean": 0.9460448026657104, "sampling/importance_sampling_ratio/min": 0.0017779277404770255, "sampling/sampling_logp_difference/max": 1.9111251831054688, "sampling/sampling_logp_difference/mean": 0.1072411984205246, "step": 1063, "step_time": 28.285838363983203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6203840766102076, "epoch": 0.02128, "grad_norm": 0.032228030264377594, "kl": 0.32782114297151566, "learning_rate": 7.998434534435937e-06, "loss": -0.0164, "step": 1064, "step_time": 15.25550974800717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.103448390960693, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3309720233082771, "epoch": 0.0213, "frac_reward_zero_std": 0.125, "grad_norm": 0.19991187751293182, "kl": 1.4467405080795288, "learning_rate": 7.998431487567293e-06, "loss": -0.0281, "num_tokens": 27132275.0, "reward": 0.5188406705856323, "reward_std": 0.6730968356132507, "rewards/rollout_reward_func/mean": 0.5188406705856323, "rewards/rollout_reward_func/std": 0.673096776008606, "sampling/importance_sampling_ratio/max": 1.2450730800628662, "sampling/importance_sampling_ratio/mean": 0.8727625608444214, "sampling/importance_sampling_ratio/min": 1.2807925553470767e-10, "sampling/sampling_logp_difference/max": 2.8219261169433594, "sampling/sampling_logp_difference/mean": 0.2979428172111511, "step": 1065, "step_time": 26.716623009036994 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 1.3363534938544035, "epoch": 0.02132, "grad_norm": 0.17811855673789978, "kl": 1.2393005974590778, "learning_rate": 7.998428437737243e-06, "loss": -0.0287, "step": 1066, "step_time": 13.256639378989348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.25, "completions/mean_terminated_length": 4.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5323411226272583, "epoch": 0.02134, "frac_reward_zero_std": 0.25, "grad_norm": 0.13324058055877686, "kl": 0.41323620826005936, "learning_rate": 7.998425384945791e-06, "loss": -0.0283, "num_tokens": 27187024.0, "reward": 0.9052436351776123, "reward_std": 0.762991726398468, "rewards/rollout_reward_func/mean": 0.9052436351776123, "rewards/rollout_reward_func/std": 0.762991726398468, "sampling/importance_sampling_ratio/max": 1.1369876861572266, "sampling/importance_sampling_ratio/mean": 0.891570508480072, "sampling/importance_sampling_ratio/min": 0.043909914791584015, "sampling/sampling_logp_difference/max": 1.77877938747406, "sampling/sampling_logp_difference/mean": 0.10190825164318085, "step": 1067, "step_time": 21.44865873499657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5441726818680763, "epoch": 0.02136, "grad_norm": 0.13671743869781494, "kl": 0.403471015393734, "learning_rate": 7.99842232919294e-06, "loss": -0.0286, "step": 1068, "step_time": 11.894174950983142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.15625, "completions/mean_terminated_length": 4.15625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2583918608725071, "epoch": 0.02138, "frac_reward_zero_std": 0.25, "grad_norm": 0.24763251841068268, "kl": 0.2933778613805771, "learning_rate": 7.998419270478692e-06, "loss": -0.0137, "num_tokens": 27242846.0, "reward": 0.7562565207481384, "reward_std": 0.635489821434021, "rewards/rollout_reward_func/mean": 0.7562565207481384, "rewards/rollout_reward_func/std": 0.635489821434021, "sampling/importance_sampling_ratio/max": 1.1074646711349487, "sampling/importance_sampling_ratio/mean": 0.987720787525177, "sampling/importance_sampling_ratio/min": 0.12462781369686127, "sampling/sampling_logp_difference/max": 0.827072024345398, "sampling/sampling_logp_difference/mean": 0.03917225822806358, "step": 1069, "step_time": 23.47123632801231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.015625, "entropy": 0.2858065217733383, "epoch": 0.0214, "grad_norm": 0.08810113370418549, "kl": 0.2988518290221691, "learning_rate": 7.998416208803047e-06, "loss": -0.0151, "step": 1070, "step_time": 13.51584854198154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 5.625, "completions/mean_terminated_length": 4.142857551574707, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.1327569149434566, "epoch": 0.02142, "frac_reward_zero_std": 0.375, "grad_norm": 0.07549132406711578, "kl": 0.2809021472930908, "learning_rate": 7.998413144166016e-06, "loss": -0.0366, "num_tokens": 27293136.0, "reward": 0.5946047306060791, "reward_std": 0.791542112827301, "rewards/rollout_reward_func/mean": 0.5946047306060791, "rewards/rollout_reward_func/std": 0.791542112827301, "sampling/importance_sampling_ratio/max": 1.223168969154358, "sampling/importance_sampling_ratio/mean": 0.8717052340507507, "sampling/importance_sampling_ratio/min": 7.099329195625614e-07, "sampling/sampling_logp_difference/max": 1.916198968887329, "sampling/sampling_logp_difference/mean": 0.2301000952720642, "step": 1071, "step_time": 27.930244284973014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1538121849298477, "epoch": 0.02144, "grad_norm": 0.08872441947460175, "kl": 0.27547933906316757, "learning_rate": 7.998410076567594e-06, "loss": -0.0362, "step": 1072, "step_time": 14.813320423010737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.9375, "completions/mean_terminated_length": 4.5806450843811035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9224826768040657, "epoch": 0.02146, "frac_reward_zero_std": 0.375, "grad_norm": 0.12641464173793793, "kl": 0.37329863011837006, "learning_rate": 7.998407006007788e-06, "loss": -0.0185, "num_tokens": 27340131.0, "reward": 0.7194358110427856, "reward_std": 0.7006070613861084, "rewards/rollout_reward_func/mean": 0.7194358110427856, "rewards/rollout_reward_func/std": 0.7006070017814636, "sampling/importance_sampling_ratio/max": 2.6449341773986816, "sampling/importance_sampling_ratio/mean": 0.8538006544113159, "sampling/importance_sampling_ratio/min": 1.3160561138647608e-05, "sampling/sampling_logp_difference/max": 1.9369851350784302, "sampling/sampling_logp_difference/mean": 0.1848231554031372, "step": 1073, "step_time": 22.8195252729638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007352941203862429, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "entropy": 0.9536651521921158, "epoch": 0.02148, "grad_norm": 0.1443851739168167, "kl": 0.37582357972860336, "learning_rate": 7.9984039324866e-06, "loss": -0.02, "step": 1074, "step_time": 12.812539322010707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 5.6875, "completions/mean_terminated_length": 4.620689868927002, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2098187431693077, "epoch": 0.0215, "frac_reward_zero_std": 0.375, "grad_norm": 0.016597658395767212, "kl": 0.3293898329138756, "learning_rate": 7.998400856004034e-06, "loss": -0.052, "num_tokens": 27389556.0, "reward": 0.8925597667694092, "reward_std": 0.8140349388122559, "rewards/rollout_reward_func/mean": 0.8925597667694092, "rewards/rollout_reward_func/std": 0.8140349388122559, "sampling/importance_sampling_ratio/max": 1.2009241580963135, "sampling/importance_sampling_ratio/mean": 0.8382095098495483, "sampling/importance_sampling_ratio/min": 1.648407277343722e-07, "sampling/sampling_logp_difference/max": 2.1694400310516357, "sampling/sampling_logp_difference/mean": 0.2222779244184494, "step": 1075, "step_time": 26.92093068198301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.219782691448927, "epoch": 0.02152, "grad_norm": 0.018705856055021286, "kl": 0.32294769771397114, "learning_rate": 7.998397776560092e-06, "loss": -0.0519, "step": 1076, "step_time": 14.176866054011043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6276283264160156, "epoch": 0.02154, "frac_reward_zero_std": 0.0, "grad_norm": 0.18767094612121582, "kl": 0.3439094126224518, "learning_rate": 7.998394694154777e-06, "loss": -0.0651, "num_tokens": 27445827.0, "reward": 0.3940180540084839, "reward_std": 0.7609527111053467, "rewards/rollout_reward_func/mean": 0.3940180540084839, "rewards/rollout_reward_func/std": 0.7609527111053467, "sampling/importance_sampling_ratio/max": 1.146553874015808, "sampling/importance_sampling_ratio/mean": 0.7695446610450745, "sampling/importance_sampling_ratio/min": 1.869928269115917e-06, "sampling/sampling_logp_difference/max": 2.0337774753570557, "sampling/sampling_logp_difference/mean": 0.272880882024765, "step": 1077, "step_time": 24.06030154202017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.6414991915225983, "epoch": 0.02156, "grad_norm": 0.1014496386051178, "kl": 0.3461955487728119, "learning_rate": 7.998391608788093e-06, "loss": -0.065, "step": 1078, "step_time": 12.145283319958253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.516129016876221, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.792630597949028, "epoch": 0.02158, "frac_reward_zero_std": 0.125, "grad_norm": 0.13506098091602325, "kl": 0.49517859518527985, "learning_rate": 7.998388520460043e-06, "loss": -0.0268, "num_tokens": 27499670.0, "reward": 0.7682566046714783, "reward_std": 0.720321536064148, "rewards/rollout_reward_func/mean": 0.7682566046714783, "rewards/rollout_reward_func/std": 0.7203214764595032, "sampling/importance_sampling_ratio/max": 1.379648208618164, "sampling/importance_sampling_ratio/mean": 0.8573100566864014, "sampling/importance_sampling_ratio/min": 0.0007927768165245652, "sampling/sampling_logp_difference/max": 1.971888780593872, "sampling/sampling_logp_difference/mean": 0.13950708508491516, "step": 1079, "step_time": 25.411449026985792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7946001961827278, "epoch": 0.0216, "grad_norm": 0.13350899517536163, "kl": 0.4968513548374176, "learning_rate": 7.998385429170629e-06, "loss": -0.027, "step": 1080, "step_time": 13.311832242005039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.28125, "completions/mean_terminated_length": 4.1724138259887695, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8925391882658005, "epoch": 0.02162, "frac_reward_zero_std": 0.125, "grad_norm": 0.12241188436746597, "kl": 0.4446878284215927, "learning_rate": 7.998382334919855e-06, "loss": -0.0414, "num_tokens": 27556088.0, "reward": 0.6381006240844727, "reward_std": 0.8162346482276917, "rewards/rollout_reward_func/mean": 0.6381006240844727, "rewards/rollout_reward_func/std": 0.8162347078323364, "sampling/importance_sampling_ratio/max": 1.4653313159942627, "sampling/importance_sampling_ratio/mean": 0.8737901449203491, "sampling/importance_sampling_ratio/min": 0.00043112554703839123, "sampling/sampling_logp_difference/max": 1.846596121788025, "sampling/sampling_logp_difference/mean": 0.17270787060260773, "step": 1081, "step_time": 25.30094069999177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8961974605917931, "epoch": 0.02164, "grad_norm": 0.1036076471209526, "kl": 0.4410836808383465, "learning_rate": 7.998379237707723e-06, "loss": -0.0414, "step": 1082, "step_time": 12.97163926900248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1202869713306427, "epoch": 0.02166, "frac_reward_zero_std": 0.25, "grad_norm": 0.1947360634803772, "kl": 0.30720342695713043, "learning_rate": 7.998376137534238e-06, "loss": -0.0547, "num_tokens": 27599384.0, "reward": 0.878005862236023, "reward_std": 0.6981322765350342, "rewards/rollout_reward_func/mean": 0.878005862236023, "rewards/rollout_reward_func/std": 0.6981322169303894, "sampling/importance_sampling_ratio/max": 1.1196527481079102, "sampling/importance_sampling_ratio/mean": 0.8370190858840942, "sampling/importance_sampling_ratio/min": 0.0003491847310215235, "sampling/sampling_logp_difference/max": 2.020752429962158, "sampling/sampling_logp_difference/mean": 0.17284290492534637, "step": 1083, "step_time": 23.145790231006686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012962963432073593, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012962963432073593, "entropy": 1.1257643103599548, "epoch": 0.02168, "grad_norm": 0.1282072216272354, "kl": 0.30408868566155434, "learning_rate": 7.998373034399401e-06, "loss": -0.0551, "step": 1084, "step_time": 12.267178195033921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7902677655220032, "epoch": 0.0217, "frac_reward_zero_std": 0.5, "grad_norm": 0.030519407242536545, "kl": 0.44232327491045, "learning_rate": 7.998369928303215e-06, "loss": -0.0453, "num_tokens": 27649661.0, "reward": 0.7371412515640259, "reward_std": 0.7683196067810059, "rewards/rollout_reward_func/mean": 0.7371412515640259, "rewards/rollout_reward_func/std": 0.7683196067810059, "sampling/importance_sampling_ratio/max": 1.1277766227722168, "sampling/importance_sampling_ratio/mean": 0.7771874070167542, "sampling/importance_sampling_ratio/min": 3.76497439447121e-07, "sampling/sampling_logp_difference/max": 1.9056665897369385, "sampling/sampling_logp_difference/mean": 0.3183041214942932, "step": 1085, "step_time": 27.261427362973336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.794075034558773, "epoch": 0.02172, "grad_norm": 0.03255894035100937, "kl": 0.454578660428524, "learning_rate": 7.998366819245686e-06, "loss": -0.0452, "step": 1086, "step_time": 14.49771382502513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.5625, "completions/mean_terminated_length": 5.586206912994385, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3170899152755737, "epoch": 0.02174, "frac_reward_zero_std": 0.125, "grad_norm": 0.12455010414123535, "kl": 0.3983154445886612, "learning_rate": 7.998363707226812e-06, "loss": -0.048, "num_tokens": 27705429.0, "reward": 0.3988115191459656, "reward_std": 0.779182493686676, "rewards/rollout_reward_func/mean": 0.3988115191459656, "rewards/rollout_reward_func/std": 0.779182493686676, "sampling/importance_sampling_ratio/max": 1.1725982427597046, "sampling/importance_sampling_ratio/mean": 0.6647981405258179, "sampling/importance_sampling_ratio/min": 5.78876949930418e-07, "sampling/sampling_logp_difference/max": 2.799466133117676, "sampling/sampling_logp_difference/mean": 0.41240260004997253, "step": 1087, "step_time": 25.317932792007923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.312849462032318, "epoch": 0.02176, "grad_norm": 0.12448230385780334, "kl": 0.3965619131922722, "learning_rate": 7.998360592246602e-06, "loss": -0.0483, "step": 1088, "step_time": 13.468746687984094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.75, "completions/mean_terminated_length": 4.387096405029297, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.772451139986515, "epoch": 0.02178, "frac_reward_zero_std": 0.625, "grad_norm": 0.01769177056849003, "kl": 0.26972372084856033, "learning_rate": 7.998357474305054e-06, "loss": -0.0353, "num_tokens": 27748484.0, "reward": 1.2193032503128052, "reward_std": 0.5324323177337646, "rewards/rollout_reward_func/mean": 1.2193032503128052, "rewards/rollout_reward_func/std": 0.5324323177337646, "sampling/importance_sampling_ratio/max": 1.1016862392425537, "sampling/importance_sampling_ratio/mean": 0.9211240410804749, "sampling/importance_sampling_ratio/min": 3.052836109418422e-05, "sampling/sampling_logp_difference/max": 2.081810235977173, "sampling/sampling_logp_difference/mean": 0.1208634302020073, "step": 1089, "step_time": 20.93032357201446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7653825841844082, "epoch": 0.0218, "grad_norm": 0.017136555165052414, "kl": 0.2767358534038067, "learning_rate": 7.998354353402173e-06, "loss": -0.0353, "step": 1090, "step_time": 10.606426643003942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 5.100000381469727, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2171454280614853, "epoch": 0.02182, "frac_reward_zero_std": 0.5, "grad_norm": 0.03291885554790497, "kl": 0.2241809256374836, "learning_rate": 7.998351229537965e-06, "loss": -0.0374, "num_tokens": 27799272.0, "reward": 0.6522530913352966, "reward_std": 0.8640869855880737, "rewards/rollout_reward_func/mean": 0.6522530913352966, "rewards/rollout_reward_func/std": 0.864086925983429, "sampling/importance_sampling_ratio/max": 1.1614443063735962, "sampling/importance_sampling_ratio/mean": 0.7928065061569214, "sampling/importance_sampling_ratio/min": 6.412401125999168e-05, "sampling/sampling_logp_difference/max": 1.9097243547439575, "sampling/sampling_logp_difference/mean": 0.22183513641357422, "step": 1091, "step_time": 23.420676018024096 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2022949010133743, "epoch": 0.02184, "grad_norm": 0.030038485303521156, "kl": 0.22414066642522812, "learning_rate": 7.998348102712429e-06, "loss": -0.0375, "step": 1092, "step_time": 12.263187401986215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 4.65625, "completions/mean_terminated_length": 4.290322303771973, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3290872350335121, "epoch": 0.02186, "frac_reward_zero_std": 0.125, "grad_norm": 0.1145070269703865, "kl": 0.6030245721340179, "learning_rate": 7.998344972925568e-06, "loss": -0.0456, "num_tokens": 27856370.0, "reward": 0.4602869153022766, "reward_std": 0.7333014011383057, "rewards/rollout_reward_func/mean": 0.4602869153022766, "rewards/rollout_reward_func/std": 0.7333014011383057, "sampling/importance_sampling_ratio/max": 1.1709622144699097, "sampling/importance_sampling_ratio/mean": 0.7956472635269165, "sampling/importance_sampling_ratio/min": 1.6123217392305378e-06, "sampling/sampling_logp_difference/max": 2.452507495880127, "sampling/sampling_logp_difference/mean": 0.2532714605331421, "step": 1093, "step_time": 24.164293125999393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3148351944983006, "epoch": 0.02188, "grad_norm": 0.1083744466304779, "kl": 0.6391911804676056, "learning_rate": 7.998341840177388e-06, "loss": -0.0462, "step": 1094, "step_time": 12.793809358001454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.2424088418483734, "epoch": 0.0219, "frac_reward_zero_std": 0.5, "grad_norm": 0.05864923447370529, "kl": 0.9268348291516304, "learning_rate": 7.99833870446789e-06, "loss": -0.0231, "num_tokens": 27905985.0, "reward": 0.882832944393158, "reward_std": 0.8542923331260681, "rewards/rollout_reward_func/mean": 0.882832944393158, "rewards/rollout_reward_func/std": 0.8542923331260681, "sampling/importance_sampling_ratio/max": 1.1026644706726074, "sampling/importance_sampling_ratio/mean": 0.7517060041427612, "sampling/importance_sampling_ratio/min": 0.00011810395517386496, "sampling/sampling_logp_difference/max": 2.0247626304626465, "sampling/sampling_logp_difference/mean": 0.2597949206829071, "step": 1095, "step_time": 25.634151218022453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012620192486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192486792803, "entropy": 1.2374985069036484, "epoch": 0.02192, "grad_norm": 0.06579610705375671, "kl": 1.0027300864458084, "learning_rate": 7.998335565797079e-06, "loss": -0.0232, "step": 1096, "step_time": 12.95678011898417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.0, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.09409892931580544, "epoch": 0.02194, "frac_reward_zero_std": 0.75, "grad_norm": 0.07347926497459412, "kl": 0.28497979044914246, "learning_rate": 7.998332424164957e-06, "loss": 0.0017, "num_tokens": 27950412.0, "reward": 0.9530029892921448, "reward_std": 0.6509908437728882, "rewards/rollout_reward_func/mean": 0.9530029892921448, "rewards/rollout_reward_func/std": 0.650990903377533, "sampling/importance_sampling_ratio/max": 1.4457136392593384, "sampling/importance_sampling_ratio/mean": 1.0606276988983154, "sampling/importance_sampling_ratio/min": 1.0121327638626099, "sampling/sampling_logp_difference/max": 0.3012092113494873, "sampling/sampling_logp_difference/mean": 0.014195991680026054, "step": 1097, "step_time": 20.323779552971246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.09262185357511044, "epoch": 0.02196, "grad_norm": 0.102592334151268, "kl": 0.28479986637830734, "learning_rate": 7.998329279571525e-06, "loss": 0.0017, "step": 1098, "step_time": 11.738298381009372 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 4.222222328186035, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.348413284868002, "epoch": 0.02198, "frac_reward_zero_std": 0.125, "grad_norm": 0.12413953989744186, "kl": 0.26546506583690643, "learning_rate": 7.998326132016789e-06, "loss": -0.0266, "num_tokens": 28004813.0, "reward": 0.6041791439056396, "reward_std": 0.9307119846343994, "rewards/rollout_reward_func/mean": 0.6041791439056396, "rewards/rollout_reward_func/std": 0.9307119846343994, "sampling/importance_sampling_ratio/max": 1.287610411643982, "sampling/importance_sampling_ratio/mean": 0.8535808324813843, "sampling/importance_sampling_ratio/min": 0.0001926287659443915, "sampling/sampling_logp_difference/max": 1.812943935394287, "sampling/sampling_logp_difference/mean": 0.20698213577270508, "step": 1099, "step_time": 29.34696491100476 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3436143919825554, "epoch": 0.022, "grad_norm": 0.13607902824878693, "kl": 0.2646901197731495, "learning_rate": 7.99832298150075e-06, "loss": -0.0267, "step": 1100, "step_time": 14.361711322009796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 5.9375, "completions/mean_terminated_length": 4.0740742683410645, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3488357961177826, "epoch": 0.02202, "frac_reward_zero_std": 0.25, "grad_norm": 0.02785326912999153, "kl": 0.40911929681897163, "learning_rate": 7.998319828023415e-06, "loss": -0.0529, "num_tokens": 28060861.0, "reward": 0.7287079095840454, "reward_std": 0.8493397831916809, "rewards/rollout_reward_func/mean": 0.7287079095840454, "rewards/rollout_reward_func/std": 0.8493398427963257, "sampling/importance_sampling_ratio/max": 1.2402814626693726, "sampling/importance_sampling_ratio/mean": 0.7732706665992737, "sampling/importance_sampling_ratio/min": 0.00011736516171367839, "sampling/sampling_logp_difference/max": 1.9177192449569702, "sampling/sampling_logp_difference/mean": 0.23553913831710815, "step": 1101, "step_time": 28.54995312402025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.352272942662239, "epoch": 0.02204, "grad_norm": 0.02705436572432518, "kl": 0.3965403884649277, "learning_rate": 7.998316671584783e-06, "loss": -0.053, "step": 1102, "step_time": 14.592945585987763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.344827651977539, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2737927362322807, "epoch": 0.02206, "frac_reward_zero_std": 0.0, "grad_norm": 0.05287747457623482, "kl": 0.43940747529268265, "learning_rate": 7.998313512184858e-06, "loss": -0.0534, "num_tokens": 28121868.0, "reward": 0.36902573704719543, "reward_std": 0.7549419403076172, "rewards/rollout_reward_func/mean": 0.36902573704719543, "rewards/rollout_reward_func/std": 0.7549419403076172, "sampling/importance_sampling_ratio/max": 1.256159782409668, "sampling/importance_sampling_ratio/mean": 0.8136545419692993, "sampling/importance_sampling_ratio/min": 0.0001329082006122917, "sampling/sampling_logp_difference/max": 1.9109097719192505, "sampling/sampling_logp_difference/mean": 0.25350040197372437, "step": 1103, "step_time": 28.891561052034376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2829019129276276, "epoch": 0.02208, "grad_norm": 0.05705045163631439, "kl": 0.43675556033849716, "learning_rate": 7.998310349823643e-06, "loss": -0.0537, "step": 1104, "step_time": 14.791552835988114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 5.038461685180664, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.528081327676773, "epoch": 0.0221, "frac_reward_zero_std": 0.125, "grad_norm": 0.08593499660491943, "kl": 0.21491101384162903, "learning_rate": 7.998307184501144e-06, "loss": -0.049, "num_tokens": 28175764.0, "reward": 0.47877684235572815, "reward_std": 0.8556308746337891, "rewards/rollout_reward_func/mean": 0.47877684235572815, "rewards/rollout_reward_func/std": 0.8556308746337891, "sampling/importance_sampling_ratio/max": 1.173862099647522, "sampling/importance_sampling_ratio/mean": 0.6990156173706055, "sampling/importance_sampling_ratio/min": 1.3762414141638146e-07, "sampling/sampling_logp_difference/max": 2.4515979290008545, "sampling/sampling_logp_difference/mean": 0.42217445373535156, "step": 1105, "step_time": 23.547423752985196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.5295770168304443, "epoch": 0.02212, "grad_norm": 0.08839786052703857, "kl": 0.2098836824297905, "learning_rate": 7.99830401621736e-06, "loss": -0.0492, "step": 1106, "step_time": 11.923507185041672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.96875, "completions/mean_terminated_length": 4.535714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8890003561973572, "epoch": 0.02214, "frac_reward_zero_std": 0.25, "grad_norm": 0.09249245375394821, "kl": 0.34331994131207466, "learning_rate": 7.998300844972297e-06, "loss": -0.0479, "num_tokens": 28226713.0, "reward": 0.7733050584793091, "reward_std": 0.8317344188690186, "rewards/rollout_reward_func/mean": 0.7733050584793091, "rewards/rollout_reward_func/std": 0.8317344188690186, "sampling/importance_sampling_ratio/max": 1.3758128881454468, "sampling/importance_sampling_ratio/mean": 0.8515002727508545, "sampling/importance_sampling_ratio/min": 1.9250965976880252e-07, "sampling/sampling_logp_difference/max": 2.1588633060455322, "sampling/sampling_logp_difference/mean": 0.35099291801452637, "step": 1107, "step_time": 24.893224787985673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8833405375480652, "epoch": 0.02216, "grad_norm": 0.06763706356287003, "kl": 0.33005621656775475, "learning_rate": 7.998297670765958e-06, "loss": -0.0482, "step": 1108, "step_time": 13.570173405983951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.629629611968994, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.562959462404251, "epoch": 0.02218, "frac_reward_zero_std": 0.625, "grad_norm": 0.03534156084060669, "kl": 0.1496436707675457, "learning_rate": 7.998294493598344e-06, "loss": -0.0362, "num_tokens": 28273468.0, "reward": 0.5685393214225769, "reward_std": 0.8375031352043152, "rewards/rollout_reward_func/mean": 0.5685393214225769, "rewards/rollout_reward_func/std": 0.8375031352043152, "sampling/importance_sampling_ratio/max": 1.2751692533493042, "sampling/importance_sampling_ratio/mean": 0.8347992300987244, "sampling/importance_sampling_ratio/min": 5.208469519857317e-05, "sampling/sampling_logp_difference/max": 1.7406786680221558, "sampling/sampling_logp_difference/mean": 0.21728824079036713, "step": 1109, "step_time": 25.379040992003866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5629847347736359, "epoch": 0.0222, "grad_norm": 0.03649233281612396, "kl": 0.14908574149012566, "learning_rate": 7.998291313469458e-06, "loss": -0.0362, "step": 1110, "step_time": 12.777417598001193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.21875, "completions/mean_terminated_length": 4.8214287757873535, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6381218135356903, "epoch": 0.02222, "frac_reward_zero_std": 0.375, "grad_norm": 0.1431199610233307, "kl": 0.5050948336720467, "learning_rate": 7.998288130379307e-06, "loss": -0.036, "num_tokens": 28320632.0, "reward": 0.9196683764457703, "reward_std": 0.8248389959335327, "rewards/rollout_reward_func/mean": 0.9196683764457703, "rewards/rollout_reward_func/std": 0.8248389363288879, "sampling/importance_sampling_ratio/max": 1.6729766130447388, "sampling/importance_sampling_ratio/mean": 0.8146504163742065, "sampling/importance_sampling_ratio/min": 4.577649633574765e-06, "sampling/sampling_logp_difference/max": 1.5837699174880981, "sampling/sampling_logp_difference/mean": 0.25381138920783997, "step": 1111, "step_time": 25.4012013010215 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629629664123058, "entropy": 1.6384622156620026, "epoch": 0.02224, "grad_norm": 0.06515949964523315, "kl": 0.47936803102493286, "learning_rate": 7.99828494432789e-06, "loss": -0.0363, "step": 1112, "step_time": 11.843732475972502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 7.0, "completions/mean_terminated_length": 4.92307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.205238789319992, "epoch": 0.02226, "frac_reward_zero_std": 0.0, "grad_norm": 0.1452689915895462, "kl": 0.21935024112462997, "learning_rate": 7.998281755315212e-06, "loss": -0.0789, "num_tokens": 28377225.0, "reward": 0.5171390175819397, "reward_std": 0.9385491609573364, "rewards/rollout_reward_func/mean": 0.5171390175819397, "rewards/rollout_reward_func/std": 0.9385491013526917, "sampling/importance_sampling_ratio/max": 1.2537497282028198, "sampling/importance_sampling_ratio/mean": 0.7106320261955261, "sampling/importance_sampling_ratio/min": 1.1161161994621693e-09, "sampling/sampling_logp_difference/max": 2.1840171813964844, "sampling/sampling_logp_difference/mean": 0.44384610652923584, "step": 1113, "step_time": 24.38651788106654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.2074450254440308, "epoch": 0.02228, "grad_norm": 0.13683569431304932, "kl": 0.21789678931236267, "learning_rate": 7.998278563341278e-06, "loss": -0.0791, "step": 1114, "step_time": 12.112930297007551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.75, "completions/mean_terminated_length": 4.285714626312256, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5458642467856407, "epoch": 0.0223, "frac_reward_zero_std": 0.125, "grad_norm": 0.11226759105920792, "kl": 0.19040180183947086, "learning_rate": 7.998275368406089e-06, "loss": -0.0542, "num_tokens": 28430339.0, "reward": 0.8708473443984985, "reward_std": 0.7976356148719788, "rewards/rollout_reward_func/mean": 0.8708473443984985, "rewards/rollout_reward_func/std": 0.7976357340812683, "sampling/importance_sampling_ratio/max": 1.4776290655136108, "sampling/importance_sampling_ratio/mean": 0.8637360334396362, "sampling/importance_sampling_ratio/min": 1.3140369446773548e-05, "sampling/sampling_logp_difference/max": 1.5421441793441772, "sampling/sampling_logp_difference/mean": 0.23626402020454407, "step": 1115, "step_time": 24.447125046979636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5516539514064789, "epoch": 0.02232, "grad_norm": 0.13611872494220734, "kl": 0.18819576874375343, "learning_rate": 7.998272170509646e-06, "loss": -0.0548, "step": 1116, "step_time": 11.857671630044933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 8.84375, "completions/mean_terminated_length": 6.043478488922119, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.625297173857689, "epoch": 0.02234, "frac_reward_zero_std": 0.0, "grad_norm": 0.0758877843618393, "kl": 0.5655364952981472, "learning_rate": 7.998268969651956e-06, "loss": -0.0919, "num_tokens": 28489963.0, "reward": 0.4052591621875763, "reward_std": 0.9040600061416626, "rewards/rollout_reward_func/mean": 0.4052591621875763, "rewards/rollout_reward_func/std": 0.9040600061416626, "sampling/importance_sampling_ratio/max": 1.2276594638824463, "sampling/importance_sampling_ratio/mean": 0.5329102277755737, "sampling/importance_sampling_ratio/min": 5.515322865079497e-09, "sampling/sampling_logp_difference/max": 2.3061511516571045, "sampling/sampling_logp_difference/mean": 0.4736981987953186, "step": 1117, "step_time": 29.366923834983027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.6308436915278435, "epoch": 0.02236, "grad_norm": 0.06780033558607101, "kl": 0.5004335455596447, "learning_rate": 7.99826576583302e-06, "loss": -0.092, "step": 1118, "step_time": 13.446981922024861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 7.09375, "completions/mean_terminated_length": 4.599999904632568, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4877489805221558, "epoch": 0.02238, "frac_reward_zero_std": 0.375, "grad_norm": 0.038216814398765564, "kl": 0.49150192365050316, "learning_rate": 7.998262559052842e-06, "loss": -0.0567, "num_tokens": 28536135.0, "reward": 0.7950649857521057, "reward_std": 0.9211364388465881, "rewards/rollout_reward_func/mean": 0.7950649857521057, "rewards/rollout_reward_func/std": 0.9211363792419434, "sampling/importance_sampling_ratio/max": 1.3410158157348633, "sampling/importance_sampling_ratio/mean": 0.7054249048233032, "sampling/importance_sampling_ratio/min": 9.619513718917005e-08, "sampling/sampling_logp_difference/max": 2.2665250301361084, "sampling/sampling_logp_difference/mean": 0.46851998567581177, "step": 1119, "step_time": 25.470535318017937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4948328137397766, "epoch": 0.0224, "grad_norm": 0.036834005266427994, "kl": 0.48650673031806946, "learning_rate": 7.998259349311424e-06, "loss": -0.0567, "step": 1120, "step_time": 11.96611283501261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 8.09375, "completions/mean_terminated_length": 5.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.487827479839325, "epoch": 0.02242, "frac_reward_zero_std": 0.125, "grad_norm": 0.17547760903835297, "kl": 0.33628561720252037, "learning_rate": 7.998256136608771e-06, "loss": -0.0556, "num_tokens": 28594418.0, "reward": 0.34707725048065186, "reward_std": 0.8461357355117798, "rewards/rollout_reward_func/mean": 0.34707725048065186, "rewards/rollout_reward_func/std": 0.8461357355117798, "sampling/importance_sampling_ratio/max": 1.3557454347610474, "sampling/importance_sampling_ratio/mean": 0.553888201713562, "sampling/importance_sampling_ratio/min": 3.411147275755866e-08, "sampling/sampling_logp_difference/max": 1.9675228595733643, "sampling/sampling_logp_difference/mean": 0.4174252152442932, "step": 1121, "step_time": 30.39590103100636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4781713485717773, "epoch": 0.02244, "grad_norm": 0.10448397696018219, "kl": 0.33530886471271515, "learning_rate": 7.998252920944886e-06, "loss": -0.056, "step": 1122, "step_time": 14.152996843971778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 6.59375, "completions/mean_terminated_length": 4.42307710647583, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3284486532211304, "epoch": 0.02246, "frac_reward_zero_std": 0.25, "grad_norm": 0.5929411053657532, "kl": 0.272858627140522, "learning_rate": 7.998249702319772e-06, "loss": -0.0639, "num_tokens": 28640056.0, "reward": 0.6292762160301208, "reward_std": 0.8708489537239075, "rewards/rollout_reward_func/mean": 0.6292762160301208, "rewards/rollout_reward_func/std": 0.8708489537239075, "sampling/importance_sampling_ratio/max": 1.1682676076889038, "sampling/importance_sampling_ratio/mean": 0.6787203550338745, "sampling/importance_sampling_ratio/min": 2.7258002432972717e-07, "sampling/sampling_logp_difference/max": 1.887431025505066, "sampling/sampling_logp_difference/mean": 0.40401583909988403, "step": 1123, "step_time": 25.272984927956713 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027901785913854837, "entropy": 2.33186474442482, "epoch": 0.02248, "grad_norm": 0.07081106305122375, "kl": 0.38021063804626465, "learning_rate": 7.998246480733429e-06, "loss": -0.0648, "step": 1124, "step_time": 12.218430709996028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 8.875, "completions/mean_terminated_length": 5.636363983154297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.0165340304374695, "epoch": 0.0225, "frac_reward_zero_std": 0.125, "grad_norm": 0.24682104587554932, "kl": 0.12848404236137867, "learning_rate": 7.998243256185865e-06, "loss": -0.0742, "num_tokens": 28698536.0, "reward": -0.018592651933431625, "reward_std": 0.7641202807426453, "rewards/rollout_reward_func/mean": -0.018592651933431625, "rewards/rollout_reward_func/std": 0.7641202807426453, "sampling/importance_sampling_ratio/max": 1.4191499948501587, "sampling/importance_sampling_ratio/mean": 0.5418094396591187, "sampling/importance_sampling_ratio/min": 8.58454285435073e-08, "sampling/sampling_logp_difference/max": 2.0485239028930664, "sampling/sampling_logp_difference/mean": 0.4172929525375366, "step": 1125, "step_time": 32.04853898298461 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 2.9939374923706055, "epoch": 0.02252, "grad_norm": 0.2036566138267517, "kl": 0.12884909100830555, "learning_rate": 7.998240028677082e-06, "loss": -0.076, "step": 1126, "step_time": 14.000587100948906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.03125, "completions/mean_terminated_length": 5.370370388031006, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.16155669093132, "epoch": 0.02254, "frac_reward_zero_std": 0.0, "grad_norm": 0.09619291871786118, "kl": 0.30405715852975845, "learning_rate": 7.99823679820708e-06, "loss": -0.0772, "num_tokens": 28747873.0, "reward": 0.1518583744764328, "reward_std": 0.8697605729103088, "rewards/rollout_reward_func/mean": 0.1518583744764328, "rewards/rollout_reward_func/std": 0.8697605133056641, "sampling/importance_sampling_ratio/max": 1.2946019172668457, "sampling/importance_sampling_ratio/mean": 0.6643751263618469, "sampling/importance_sampling_ratio/min": 3.456017566350056e-06, "sampling/sampling_logp_difference/max": 2.234164237976074, "sampling/sampling_logp_difference/mean": 0.3700297474861145, "step": 1127, "step_time": 25.037365463009337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015817468985915184, "clip_ratio/low_min": 0.005319148767739534, "clip_ratio/region_mean": 0.015817468985915184, "entropy": 2.125289559364319, "epoch": 0.02256, "grad_norm": 0.06869763135910034, "kl": 0.31526441127061844, "learning_rate": 7.998233564775866e-06, "loss": -0.078, "step": 1128, "step_time": 11.00717575303861 }, { "clip_ratio/high_max": 0.009259259328246117, "clip_ratio/high_mean": 0.004629629664123058, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629629664123058, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.0625, "completions/mean_terminated_length": 5.034482955932617, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2611274421215057, "epoch": 0.02258, "frac_reward_zero_std": 0.0, "grad_norm": 0.12453935295343399, "kl": 0.28078699856996536, "learning_rate": 7.998230328383438e-06, "loss": -0.0749, "num_tokens": 28801649.0, "reward": 0.18674491345882416, "reward_std": 0.8326628804206848, "rewards/rollout_reward_func/mean": 0.18674491345882416, "rewards/rollout_reward_func/std": 0.83266282081604, "sampling/importance_sampling_ratio/max": 1.9923416376113892, "sampling/importance_sampling_ratio/mean": 0.6825085878372192, "sampling/importance_sampling_ratio/min": 4.0985972304952156e-07, "sampling/sampling_logp_difference/max": 2.1524598598480225, "sampling/sampling_logp_difference/mean": 0.4185503423213959, "step": 1129, "step_time": 23.171588855970185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.029302524402737617, "clip_ratio/low_min": 0.00657894741743803, "clip_ratio/region_mean": 0.029302524402737617, "entropy": 2.2409035861492157, "epoch": 0.0226, "grad_norm": 0.1084061935544014, "kl": 0.3146791644394398, "learning_rate": 7.998227089029806e-06, "loss": -0.0754, "step": 1130, "step_time": 12.105665979004698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.8125, "completions/mean_terminated_length": 4.357142925262451, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4497782588005066, "epoch": 0.02262, "frac_reward_zero_std": 0.375, "grad_norm": 0.03813375160098076, "kl": 0.6327742375433445, "learning_rate": 7.998223846714971e-06, "loss": -0.0425, "num_tokens": 28860888.0, "reward": 0.5669540166854858, "reward_std": 0.8289517760276794, "rewards/rollout_reward_func/mean": 0.5669540166854858, "rewards/rollout_reward_func/std": 0.8289517760276794, "sampling/importance_sampling_ratio/max": 1.2896777391433716, "sampling/importance_sampling_ratio/mean": 0.7781937122344971, "sampling/importance_sampling_ratio/min": 6.940663297427818e-05, "sampling/sampling_logp_difference/max": 1.5848692655563354, "sampling/sampling_logp_difference/mean": 0.23584261536598206, "step": 1131, "step_time": 29.163435828028014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4427136480808258, "epoch": 0.02264, "grad_norm": 0.04347524791955948, "kl": 0.7616783492267132, "learning_rate": 7.998220601438933e-06, "loss": -0.0424, "step": 1132, "step_time": 14.529810687992722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.875, "completions/mean_terminated_length": 4.133333683013916, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8055048771202564, "epoch": 0.02266, "frac_reward_zero_std": 0.375, "grad_norm": 0.10187334567308426, "kl": 0.4801923893392086, "learning_rate": 7.998217353201698e-06, "loss": -0.0397, "num_tokens": 28905231.0, "reward": 0.9052320122718811, "reward_std": 0.7591638565063477, "rewards/rollout_reward_func/mean": 0.9052320122718811, "rewards/rollout_reward_func/std": 0.7591637969017029, "sampling/importance_sampling_ratio/max": 1.0703636407852173, "sampling/importance_sampling_ratio/mean": 0.8924121856689453, "sampling/importance_sampling_ratio/min": 1.980673914658837e-06, "sampling/sampling_logp_difference/max": 1.6777421236038208, "sampling/sampling_logp_difference/mean": 0.16532586514949799, "step": 1133, "step_time": 20.466660729958676 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.8003844656050205, "epoch": 0.02268, "grad_norm": 0.05404944717884064, "kl": 0.49244656413793564, "learning_rate": 7.99821410200327e-06, "loss": -0.0398, "step": 1134, "step_time": 11.17385347597883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 4.5625, "completions/mean_terminated_length": 4.193548202514648, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6227080635726452, "epoch": 0.0227, "frac_reward_zero_std": 0.25, "grad_norm": 0.14596807956695557, "kl": 1.723814696073532, "learning_rate": 7.998210847843649e-06, "loss": -0.0249, "num_tokens": 28957843.0, "reward": 0.7297431230545044, "reward_std": 0.806954562664032, "rewards/rollout_reward_func/mean": 0.7297431230545044, "rewards/rollout_reward_func/std": 0.8069545030593872, "sampling/importance_sampling_ratio/max": 1.1521614789962769, "sampling/importance_sampling_ratio/mean": 0.90946364402771, "sampling/importance_sampling_ratio/min": 0.001644005300477147, "sampling/sampling_logp_difference/max": 2.136176586151123, "sampling/sampling_logp_difference/mean": 0.12253324687480927, "step": 1135, "step_time": 23.487450630025705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6180218383669853, "epoch": 0.02272, "grad_norm": 0.12126687914133072, "kl": 1.40399581938982, "learning_rate": 7.99820759072284e-06, "loss": -0.0257, "step": 1136, "step_time": 13.612939260987332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.09375, "completions/mean_terminated_length": 4.366666793823242, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0951606426388025, "epoch": 0.02274, "frac_reward_zero_std": 0.0, "grad_norm": 0.25021862983703613, "kl": 0.6917925849556923, "learning_rate": 7.998204330640846e-06, "loss": -0.0318, "num_tokens": 29014282.0, "reward": 0.5187352299690247, "reward_std": 0.6582895517349243, "rewards/rollout_reward_func/mean": 0.5187352299690247, "rewards/rollout_reward_func/std": 0.6582895517349243, "sampling/importance_sampling_ratio/max": 1.1603710651397705, "sampling/importance_sampling_ratio/mean": 0.827908456325531, "sampling/importance_sampling_ratio/min": 0.00023077544756233692, "sampling/sampling_logp_difference/max": 1.8457800149917603, "sampling/sampling_logp_difference/mean": 0.19411617517471313, "step": 1137, "step_time": 25.43214812898077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 1.1003573145717382, "epoch": 0.02276, "grad_norm": 0.17004908621311188, "kl": 0.7013780772686005, "learning_rate": 7.998201067597671e-06, "loss": -0.0328, "step": 1138, "step_time": 12.922182760958094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 4.375, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5131848100572824, "epoch": 0.02278, "frac_reward_zero_std": 0.5, "grad_norm": 0.025264866650104523, "kl": 0.4513426385819912, "learning_rate": 7.998197801593319e-06, "loss": -0.0212, "num_tokens": 29062887.0, "reward": 0.602891206741333, "reward_std": 0.8119417428970337, "rewards/rollout_reward_func/mean": 0.602891206741333, "rewards/rollout_reward_func/std": 0.8119416832923889, "sampling/importance_sampling_ratio/max": 1.1099953651428223, "sampling/importance_sampling_ratio/mean": 0.9612044095993042, "sampling/importance_sampling_ratio/min": 1.274583269150753e-06, "sampling/sampling_logp_difference/max": 1.8824548721313477, "sampling/sampling_logp_difference/mean": 0.11783641576766968, "step": 1139, "step_time": 17.955917716986733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5215652026236057, "epoch": 0.0228, "grad_norm": 0.029455306008458138, "kl": 0.4385251998901367, "learning_rate": 7.99819453262779e-06, "loss": -0.0213, "step": 1140, "step_time": 9.939023029030068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4717206060886383, "epoch": 0.02282, "frac_reward_zero_std": 0.375, "grad_norm": 0.1900545358657837, "kl": 1.517103150486946, "learning_rate": 7.99819126070109e-06, "loss": -0.0409, "num_tokens": 29109027.0, "reward": 0.41887179017066956, "reward_std": 0.8380548357963562, "rewards/rollout_reward_func/mean": 0.41887179017066956, "rewards/rollout_reward_func/std": 0.8380548357963562, "sampling/importance_sampling_ratio/max": 1.1388463973999023, "sampling/importance_sampling_ratio/mean": 0.7366805672645569, "sampling/importance_sampling_ratio/min": 6.067934776865513e-08, "sampling/sampling_logp_difference/max": 2.3149142265319824, "sampling/sampling_logp_difference/mean": 0.29134199023246765, "step": 1141, "step_time": 19.44446995703038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4756021350622177, "epoch": 0.02284, "grad_norm": 0.12562061846256256, "kl": 1.202589213848114, "learning_rate": 7.998187985813221e-06, "loss": -0.042, "step": 1142, "step_time": 10.389648615993792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.120020966976881, "epoch": 0.02286, "frac_reward_zero_std": 0.5, "grad_norm": 0.03623166307806969, "kl": 0.2266441285610199, "learning_rate": 7.998184707964188e-06, "loss": -0.0432, "num_tokens": 29161242.0, "reward": 1.089469075202942, "reward_std": 0.6816537380218506, "rewards/rollout_reward_func/mean": 1.089469075202942, "rewards/rollout_reward_func/std": 0.6816536784172058, "sampling/importance_sampling_ratio/max": 1.1063199043273926, "sampling/importance_sampling_ratio/mean": 0.9048371315002441, "sampling/importance_sampling_ratio/min": 1.9041271670516835e-08, "sampling/sampling_logp_difference/max": 2.417379856109619, "sampling/sampling_logp_difference/mean": 0.20060819387435913, "step": 1143, "step_time": 23.934334531018976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1277051353827119, "epoch": 0.02288, "grad_norm": 0.03535770997405052, "kl": 0.2272377386689186, "learning_rate": 7.998181427153991e-06, "loss": -0.0432, "step": 1144, "step_time": 11.944551457010675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.9615443088114262, "epoch": 0.0229, "frac_reward_zero_std": 0.375, "grad_norm": 0.07285106182098389, "kl": 0.362491887062788, "learning_rate": 7.998178143382636e-06, "loss": -0.0381, "num_tokens": 29216922.0, "reward": 0.40854907035827637, "reward_std": 0.7972254753112793, "rewards/rollout_reward_func/mean": 0.40854907035827637, "rewards/rollout_reward_func/std": 0.7972254753112793, "sampling/importance_sampling_ratio/max": 1.113387107849121, "sampling/importance_sampling_ratio/mean": 0.8371776342391968, "sampling/importance_sampling_ratio/min": 0.0011903219856321812, "sampling/sampling_logp_difference/max": 2.0691914558410645, "sampling/sampling_logp_difference/mean": 0.1638232171535492, "step": 1145, "step_time": 25.991481577046216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9679790884256363, "epoch": 0.02292, "grad_norm": 0.07843617349863052, "kl": 0.35415777564048767, "learning_rate": 7.998174856650125e-06, "loss": -0.0381, "step": 1146, "step_time": 14.178931563015794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.78125, "completions/mean_terminated_length": 4.4193549156188965, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.0572575777769089, "epoch": 0.02294, "frac_reward_zero_std": 0.125, "grad_norm": 0.18797412514686584, "kl": 0.8027616664767265, "learning_rate": 7.998171566956463e-06, "loss": -0.0286, "num_tokens": 29273480.0, "reward": 0.7659832239151001, "reward_std": 0.7295547127723694, "rewards/rollout_reward_func/mean": 0.7659832239151001, "rewards/rollout_reward_func/std": 0.7295547127723694, "sampling/importance_sampling_ratio/max": 1.4351329803466797, "sampling/importance_sampling_ratio/mean": 0.8784580826759338, "sampling/importance_sampling_ratio/min": 0.0005139604327268898, "sampling/sampling_logp_difference/max": 2.203354835510254, "sampling/sampling_logp_difference/mean": 0.2016032636165619, "step": 1147, "step_time": 26.57693406799808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0669124871492386, "epoch": 0.02296, "grad_norm": 0.06625621020793915, "kl": 0.7297957874834538, "learning_rate": 7.99816827430165e-06, "loss": -0.0292, "step": 1148, "step_time": 13.692948395968415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 14.0, "completions/mean_length": 5.21875, "completions/mean_terminated_length": 4.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.073913685977459, "epoch": 0.02298, "frac_reward_zero_std": 0.125, "grad_norm": 0.0989561676979065, "kl": 0.8020806796848774, "learning_rate": 7.998164978685692e-06, "loss": -0.0466, "num_tokens": 29328092.0, "reward": 0.6254901885986328, "reward_std": 0.7434563040733337, "rewards/rollout_reward_func/mean": 0.6254901885986328, "rewards/rollout_reward_func/std": 0.7434563040733337, "sampling/importance_sampling_ratio/max": 1.156158447265625, "sampling/importance_sampling_ratio/mean": 0.8829984664916992, "sampling/importance_sampling_ratio/min": 3.2277264836011454e-05, "sampling/sampling_logp_difference/max": 1.8612853288650513, "sampling/sampling_logp_difference/mean": 0.19174475967884064, "step": 1149, "step_time": 24.10626761900494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0878034681081772, "epoch": 0.023, "grad_norm": 0.09834446012973785, "kl": 0.7222829684615135, "learning_rate": 7.99816168010859e-06, "loss": -0.0469, "step": 1150, "step_time": 12.809172771027079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 4.78125, "completions/mean_terminated_length": 4.033333778381348, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.8774251118302345, "epoch": 0.02302, "frac_reward_zero_std": 0.25, "grad_norm": 0.08667312562465668, "kl": 0.3741007000207901, "learning_rate": 7.99815837857035e-06, "loss": -0.0431, "num_tokens": 29388211.0, "reward": 0.7997167110443115, "reward_std": 0.7770023345947266, "rewards/rollout_reward_func/mean": 0.7997167110443115, "rewards/rollout_reward_func/std": 0.7770022749900818, "sampling/importance_sampling_ratio/max": 1.134451150894165, "sampling/importance_sampling_ratio/mean": 0.8423101902008057, "sampling/importance_sampling_ratio/min": 0.005646315403282642, "sampling/sampling_logp_difference/max": 1.8360131978988647, "sampling/sampling_logp_difference/mean": 0.12511327862739563, "step": 1151, "step_time": 28.53697896003723 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015165441203862429, "entropy": 0.9012009799480438, "epoch": 0.02304, "grad_norm": 0.05896695703268051, "kl": 0.34226732328534126, "learning_rate": 7.998155074070974e-06, "loss": -0.0437, "step": 1152, "step_time": 14.717179817002034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.400000095367432, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2958465218544006, "epoch": 0.02306, "frac_reward_zero_std": 0.125, "grad_norm": 0.03178466856479645, "kl": 0.3469966873526573, "learning_rate": 7.998151766610464e-06, "loss": -0.0582, "num_tokens": 29443228.0, "reward": 0.8465031385421753, "reward_std": 0.7746825814247131, "rewards/rollout_reward_func/mean": 0.8465031385421753, "rewards/rollout_reward_func/std": 0.7746825814247131, "sampling/importance_sampling_ratio/max": 1.195271611213684, "sampling/importance_sampling_ratio/mean": 0.7856268882751465, "sampling/importance_sampling_ratio/min": 0.00014480615209322423, "sampling/sampling_logp_difference/max": 1.9937829971313477, "sampling/sampling_logp_difference/mean": 0.2412727177143097, "step": 1153, "step_time": 23.000508911034558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3017127960920334, "epoch": 0.02308, "grad_norm": 0.03504647687077522, "kl": 0.3300250545144081, "learning_rate": 7.998148456188825e-06, "loss": -0.0581, "step": 1154, "step_time": 12.143644196999958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.4375, "completions/mean_terminated_length": 4.344827651977539, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4275836944580078, "epoch": 0.0231, "frac_reward_zero_std": 0.125, "grad_norm": 0.047337379306554794, "kl": 0.3099314123392105, "learning_rate": 7.99814514280606e-06, "loss": -0.0649, "num_tokens": 29495354.0, "reward": 0.8078914880752563, "reward_std": 0.8333567976951599, "rewards/rollout_reward_func/mean": 0.8078914880752563, "rewards/rollout_reward_func/std": 0.8333567976951599, "sampling/importance_sampling_ratio/max": 1.1443885564804077, "sampling/importance_sampling_ratio/mean": 0.8453516364097595, "sampling/importance_sampling_ratio/min": 6.33769658975325e-08, "sampling/sampling_logp_difference/max": 2.253375291824341, "sampling/sampling_logp_difference/mean": 0.2482479214668274, "step": 1155, "step_time": 24.20589732096414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4422938749194145, "epoch": 0.02312, "grad_norm": 0.046982914209365845, "kl": 0.30133412033319473, "learning_rate": 7.998141826462172e-06, "loss": -0.0649, "step": 1156, "step_time": 12.995057798019843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 5.5625, "completions/mean_terminated_length": 4.482758522033691, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5118903145194054, "epoch": 0.02314, "frac_reward_zero_std": 0.0, "grad_norm": 0.09912441670894623, "kl": 0.2600381337106228, "learning_rate": 7.998138507157163e-06, "loss": -0.0592, "num_tokens": 29550074.0, "reward": 0.7099556922912598, "reward_std": 0.765694260597229, "rewards/rollout_reward_func/mean": 0.7099556922912598, "rewards/rollout_reward_func/std": 0.7656942009925842, "sampling/importance_sampling_ratio/max": 1.1104686260223389, "sampling/importance_sampling_ratio/mean": 0.7812092900276184, "sampling/importance_sampling_ratio/min": 8.8317410700256e-06, "sampling/sampling_logp_difference/max": 1.7229853868484497, "sampling/sampling_logp_difference/mean": 0.2472170740365982, "step": 1157, "step_time": 24.112435803981498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5130150616168976, "epoch": 0.02316, "grad_norm": 0.09679514914751053, "kl": 0.25164712965488434, "learning_rate": 7.998135184891039e-06, "loss": -0.0595, "step": 1158, "step_time": 12.721816305012908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 5.875, "completions/mean_terminated_length": 4.428571701049805, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8052792772650719, "epoch": 0.02318, "frac_reward_zero_std": 0.125, "grad_norm": 0.12490734457969666, "kl": 0.7821129746735096, "learning_rate": 7.998131859663801e-06, "loss": -0.0556, "num_tokens": 29603607.0, "reward": 0.58319491147995, "reward_std": 0.7852653861045837, "rewards/rollout_reward_func/mean": 0.58319491147995, "rewards/rollout_reward_func/std": 0.7852653861045837, "sampling/importance_sampling_ratio/max": 1.172229290008545, "sampling/importance_sampling_ratio/mean": 0.7896944284439087, "sampling/importance_sampling_ratio/min": 2.379622117132385e-07, "sampling/sampling_logp_difference/max": 2.1695094108581543, "sampling/sampling_logp_difference/mean": 0.33052802085876465, "step": 1159, "step_time": 26.243435201991815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8025582507252693, "epoch": 0.0232, "grad_norm": 0.10930921137332916, "kl": 0.7181221470236778, "learning_rate": 7.998128531475453e-06, "loss": -0.056, "step": 1160, "step_time": 13.137200611963635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 12.0, "completions/mean_length": 5.78125, "completions/mean_terminated_length": 4.724137783050537, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.810909129679203, "epoch": 0.02322, "frac_reward_zero_std": 0.125, "grad_norm": 0.03959612548351288, "kl": 0.4016019329428673, "learning_rate": 7.998125200326e-06, "loss": -0.0522, "num_tokens": 29650026.0, "reward": 0.7716718912124634, "reward_std": 0.8958374261856079, "rewards/rollout_reward_func/mean": 0.7716718912124634, "rewards/rollout_reward_func/std": 0.8958374261856079, "sampling/importance_sampling_ratio/max": 1.2238246202468872, "sampling/importance_sampling_ratio/mean": 0.7911500930786133, "sampling/importance_sampling_ratio/min": 9.854360172312226e-08, "sampling/sampling_logp_difference/max": 1.8983142375946045, "sampling/sampling_logp_difference/mean": 0.3421778082847595, "step": 1161, "step_time": 20.248139678966254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8074566796422005, "epoch": 0.02324, "grad_norm": 0.03716335445642471, "kl": 0.4004796966910362, "learning_rate": 7.998121866215441e-06, "loss": -0.0522, "step": 1162, "step_time": 10.701915811019717 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.28125, "completions/max_length": 16.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 8.25, "completions/mean_terminated_length": 5.217391490936279, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6794314980506897, "epoch": 0.02326, "frac_reward_zero_std": 0.125, "grad_norm": 0.17868199944496155, "kl": 0.18945042043924332, "learning_rate": 7.998118529143782e-06, "loss": -0.0628, "num_tokens": 29697043.0, "reward": 0.3873021900653839, "reward_std": 0.9777090549468994, "rewards/rollout_reward_func/mean": 0.3873021900653839, "rewards/rollout_reward_func/std": 0.9777090549468994, "sampling/importance_sampling_ratio/max": 1.060271143913269, "sampling/importance_sampling_ratio/mean": 0.5226738452911377, "sampling/importance_sampling_ratio/min": 8.858582134507742e-08, "sampling/sampling_logp_difference/max": 2.1605260372161865, "sampling/sampling_logp_difference/mean": 0.42915594577789307, "step": 1163, "step_time": 23.92858311199234 }, { "clip_ratio/high_max": 0.029361264314502478, "clip_ratio/high_mean": 0.014680632157251239, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014680632157251239, "entropy": 2.654804229736328, "epoch": 0.02328, "grad_norm": 0.12617771327495575, "kl": 0.18745872378349304, "learning_rate": 7.998115189111029e-06, "loss": -0.064, "step": 1164, "step_time": 10.799130539991893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.21875, "completions/mean_terminated_length": 4.759999752044678, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.6769051551818848, "epoch": 0.0233, "frac_reward_zero_std": 0.0, "grad_norm": 0.09340842813253403, "kl": 0.27603979408741, "learning_rate": 7.99811184611718e-06, "loss": -0.0842, "num_tokens": 29749199.0, "reward": 0.49869734048843384, "reward_std": 0.8954190015792847, "rewards/rollout_reward_func/mean": 0.49869734048843384, "rewards/rollout_reward_func/std": 0.8954190015792847, "sampling/importance_sampling_ratio/max": 1.0995142459869385, "sampling/importance_sampling_ratio/mean": 0.5188831686973572, "sampling/importance_sampling_ratio/min": 2.7894710541431778e-08, "sampling/sampling_logp_difference/max": 2.377974033355713, "sampling/sampling_logp_difference/mean": 0.43134891986846924, "step": 1165, "step_time": 30.02998623100575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.666404902935028, "epoch": 0.02332, "grad_norm": 0.09462405741214752, "kl": 0.2811807878315449, "learning_rate": 7.998108500162241e-06, "loss": -0.0848, "step": 1166, "step_time": 15.437519247003365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 7.34375, "completions/mean_terminated_length": 4.458333492279053, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3907927870750427, "epoch": 0.02334, "frac_reward_zero_std": 0.125, "grad_norm": 0.0957905575633049, "kl": 0.1590907983481884, "learning_rate": 7.998105151246216e-06, "loss": -0.0639, "num_tokens": 29806804.0, "reward": 0.6181223392486572, "reward_std": 0.8699783682823181, "rewards/rollout_reward_func/mean": 0.6181223392486572, "rewards/rollout_reward_func/std": 0.8699783682823181, "sampling/importance_sampling_ratio/max": 1.2714594602584839, "sampling/importance_sampling_ratio/mean": 0.7324692606925964, "sampling/importance_sampling_ratio/min": 4.5703909563599154e-06, "sampling/sampling_logp_difference/max": 2.8896384239196777, "sampling/sampling_logp_difference/mean": 0.35393834114074707, "step": 1167, "step_time": 29.29392401498626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.378775417804718, "epoch": 0.02336, "grad_norm": 0.08892199397087097, "kl": 0.15900717861950397, "learning_rate": 7.998101799369107e-06, "loss": -0.0643, "step": 1168, "step_time": 14.061335890961345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 5.3125, "completions/mean_terminated_length": 4.2068963050842285, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2286119852215052, "epoch": 0.02338, "frac_reward_zero_std": 0.125, "grad_norm": 0.19731298089027405, "kl": 0.24600958079099655, "learning_rate": 7.998098444530917e-06, "loss": -0.0294, "num_tokens": 29863226.0, "reward": 0.623046875, "reward_std": 0.8377927541732788, "rewards/rollout_reward_func/mean": 0.623046875, "rewards/rollout_reward_func/std": 0.8377927541732788, "sampling/importance_sampling_ratio/max": 1.4935007095336914, "sampling/importance_sampling_ratio/mean": 0.9033334255218506, "sampling/importance_sampling_ratio/min": 0.00035367871169000864, "sampling/sampling_logp_difference/max": 2.2579221725463867, "sampling/sampling_logp_difference/mean": 0.20588576793670654, "step": 1169, "step_time": 25.59597409600974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2265979554504156, "epoch": 0.0234, "grad_norm": 0.2282046228647232, "kl": 0.24637491255998611, "learning_rate": 7.998095086731653e-06, "loss": -0.0302, "step": 1170, "step_time": 12.925204034982016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.9375, "completions/mean_terminated_length": 4.84615421295166, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.058445245027542, "epoch": 0.02342, "frac_reward_zero_std": 0.125, "grad_norm": 0.20781917870044708, "kl": 0.302696093916893, "learning_rate": 7.998091725971311e-06, "loss": -0.0599, "num_tokens": 29916566.0, "reward": 0.4785679578781128, "reward_std": 0.8577658534049988, "rewards/rollout_reward_func/mean": 0.4785679578781128, "rewards/rollout_reward_func/std": 0.8577658534049988, "sampling/importance_sampling_ratio/max": 1.4910887479782104, "sampling/importance_sampling_ratio/mean": 0.698736310005188, "sampling/importance_sampling_ratio/min": 0.00010960427607642487, "sampling/sampling_logp_difference/max": 2.226334571838379, "sampling/sampling_logp_difference/mean": 0.3372885584831238, "step": 1171, "step_time": 25.450121925008716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.0483752489089966, "epoch": 0.02344, "grad_norm": 0.17137284576892853, "kl": 0.30464838817715645, "learning_rate": 7.998088362249902e-06, "loss": -0.0606, "step": 1172, "step_time": 12.030167882010574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 8.3125, "completions/mean_terminated_length": 4.285714149475098, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4040569365024567, "epoch": 0.02346, "frac_reward_zero_std": 0.0, "grad_norm": 0.061511751264333725, "kl": 0.3508271872997284, "learning_rate": 7.998084995567425e-06, "loss": -0.0602, "num_tokens": 29977389.0, "reward": 0.13967277109622955, "reward_std": 0.8527383208274841, "rewards/rollout_reward_func/mean": 0.13967277109622955, "rewards/rollout_reward_func/std": 0.8527382612228394, "sampling/importance_sampling_ratio/max": 1.1913199424743652, "sampling/importance_sampling_ratio/mean": 0.5831161737442017, "sampling/importance_sampling_ratio/min": 1.2765777057666128e-07, "sampling/sampling_logp_difference/max": 2.087965488433838, "sampling/sampling_logp_difference/mean": 0.3846375644207001, "step": 1173, "step_time": 30.44617820699932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.4036976397037506, "epoch": 0.02348, "grad_norm": 0.06062868982553482, "kl": 0.3503169119358063, "learning_rate": 7.998081625923884e-06, "loss": -0.06, "step": 1174, "step_time": 13.827976584027056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.25, "completions/mean_terminated_length": 4.44444465637207, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8470922857522964, "epoch": 0.0235, "frac_reward_zero_std": 0.25, "grad_norm": 0.02902926504611969, "kl": 0.4224323481321335, "learning_rate": 7.998078253319283e-06, "loss": -0.0608, "num_tokens": 30030784.0, "reward": 0.7103447318077087, "reward_std": 0.9003363847732544, "rewards/rollout_reward_func/mean": 0.7103447318077087, "rewards/rollout_reward_func/std": 0.9003363251686096, "sampling/importance_sampling_ratio/max": 1.1046640872955322, "sampling/importance_sampling_ratio/mean": 0.7134368419647217, "sampling/importance_sampling_ratio/min": 2.4998706749101984e-07, "sampling/sampling_logp_difference/max": 2.6504554748535156, "sampling/sampling_logp_difference/mean": 0.3497793674468994, "step": 1175, "step_time": 28.36210454502725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8508355468511581, "epoch": 0.02352, "grad_norm": 0.027256323024630547, "kl": 0.4093067869544029, "learning_rate": 7.998074877753625e-06, "loss": -0.0608, "step": 1176, "step_time": 15.124795711017214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 4.71875, "completions/mean_terminated_length": 4.3548383712768555, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.6953024715185165, "epoch": 0.02354, "frac_reward_zero_std": 0.5, "grad_norm": 0.036001648753881454, "kl": 0.31843333691358566, "learning_rate": 7.998071499226914e-06, "loss": -0.0439, "num_tokens": 30078496.0, "reward": 1.1451990604400635, "reward_std": 0.6054174304008484, "rewards/rollout_reward_func/mean": 1.1451990604400635, "rewards/rollout_reward_func/std": 0.6054174304008484, "sampling/importance_sampling_ratio/max": 1.368852138519287, "sampling/importance_sampling_ratio/mean": 0.9213640093803406, "sampling/importance_sampling_ratio/min": 4.008363976026885e-05, "sampling/sampling_logp_difference/max": 1.8562430143356323, "sampling/sampling_logp_difference/mean": 0.14237795770168304, "step": 1177, "step_time": 26.547151101956842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6949584782123566, "epoch": 0.02356, "grad_norm": 0.036512646824121475, "kl": 0.32256684824824333, "learning_rate": 7.99806811773915e-06, "loss": -0.0439, "step": 1178, "step_time": 13.793802164000226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.629629611968994, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5937134772539139, "epoch": 0.02358, "frac_reward_zero_std": 0.125, "grad_norm": 0.07064198702573776, "kl": 0.3204721622169018, "learning_rate": 7.998064733290341e-06, "loss": -0.0676, "num_tokens": 30134692.0, "reward": 0.3737450838088989, "reward_std": 0.7660006284713745, "rewards/rollout_reward_func/mean": 0.3737450838088989, "rewards/rollout_reward_func/std": 0.7660005688667297, "sampling/importance_sampling_ratio/max": 1.1290276050567627, "sampling/importance_sampling_ratio/mean": 0.7882899045944214, "sampling/importance_sampling_ratio/min": 4.786893015307214e-08, "sampling/sampling_logp_difference/max": 2.3172850608825684, "sampling/sampling_logp_difference/mean": 0.3201284110546112, "step": 1179, "step_time": 26.682660551014123 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.5820052176713943, "epoch": 0.0236, "grad_norm": 0.0451628752052784, "kl": 0.3273255005478859, "learning_rate": 7.998061345880489e-06, "loss": -0.0679, "step": 1180, "step_time": 13.23186123702908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.0625, "completions/mean_terminated_length": 5.407407283782959, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 2.3537547290325165, "epoch": 0.02362, "frac_reward_zero_std": 0.0, "grad_norm": 0.08741404116153717, "kl": 0.30147722363471985, "learning_rate": 7.998057955509595e-06, "loss": -0.0686, "num_tokens": 30191085.0, "reward": 0.3431262671947479, "reward_std": 0.8241440653800964, "rewards/rollout_reward_func/mean": 0.3431262671947479, "rewards/rollout_reward_func/std": 0.8241440057754517, "sampling/importance_sampling_ratio/max": 1.2087846994400024, "sampling/importance_sampling_ratio/mean": 0.6840640902519226, "sampling/importance_sampling_ratio/min": 5.911257403568015e-07, "sampling/sampling_logp_difference/max": 2.2103848457336426, "sampling/sampling_logp_difference/mean": 0.37026312947273254, "step": 1181, "step_time": 28.701865264039952 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 2.3392516672611237, "epoch": 0.02364, "grad_norm": 0.04557232931256294, "kl": 0.31625743955373764, "learning_rate": 7.998054562177666e-06, "loss": -0.0687, "step": 1182, "step_time": 15.29218527401099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 6.71875, "completions/mean_terminated_length": 4.119999885559082, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.7918213605880737, "epoch": 0.02366, "frac_reward_zero_std": 0.25, "grad_norm": 0.11490709334611893, "kl": 0.24551334604620934, "learning_rate": 7.998051165884703e-06, "loss": -0.0628, "num_tokens": 30244729.0, "reward": 0.5440245866775513, "reward_std": 0.8965389728546143, "rewards/rollout_reward_func/mean": 0.5440245866775513, "rewards/rollout_reward_func/std": 0.8965389728546143, "sampling/importance_sampling_ratio/max": 1.14948570728302, "sampling/importance_sampling_ratio/mean": 0.7861847877502441, "sampling/importance_sampling_ratio/min": 1.7600248611415736e-05, "sampling/sampling_logp_difference/max": 1.9075422286987305, "sampling/sampling_logp_difference/mean": 0.2973445653915405, "step": 1183, "step_time": 29.303610094997566 }, { "clip_ratio/high_max": 0.024553571827709675, "clip_ratio/high_mean": 0.012276785913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 1.7888547778129578, "epoch": 0.02368, "grad_norm": 0.019197899848222733, "kl": 0.24890637025237083, "learning_rate": 7.998047766630709e-06, "loss": -0.0632, "step": 1184, "step_time": 14.798328956996556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16.0, "completions/max_terminated_length": 4.0, "completions/mean_length": 5.125, "completions/mean_terminated_length": 4.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.3456225097179413, "epoch": 0.0237, "frac_reward_zero_std": 0.375, "grad_norm": 0.14845843613147736, "kl": 0.5742854848504066, "learning_rate": 7.998044364415687e-06, "loss": -0.0427, "num_tokens": 30294218.0, "reward": 0.8497551679611206, "reward_std": 0.665277898311615, "rewards/rollout_reward_func/mean": 0.8497551679611206, "rewards/rollout_reward_func/std": 0.665277898311615, "sampling/importance_sampling_ratio/max": 1.2805085182189941, "sampling/importance_sampling_ratio/mean": 0.9494084119796753, "sampling/importance_sampling_ratio/min": 1.945419292326278e-08, "sampling/sampling_logp_difference/max": 1.969653844833374, "sampling/sampling_logp_difference/mean": 0.29486995935440063, "step": 1185, "step_time": 24.03237672802061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3446962758898735, "epoch": 0.02372, "grad_norm": 0.13286927342414856, "kl": 0.5363965295255184, "learning_rate": 7.998040959239642e-06, "loss": -0.0435, "step": 1186, "step_time": 12.62170800298918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 7.125, "completions/mean_terminated_length": 5.076923370361328, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.8658650517463684, "epoch": 0.02374, "frac_reward_zero_std": 0.375, "grad_norm": 0.059194598346948624, "kl": 0.3139299973845482, "learning_rate": 7.998037551102578e-06, "loss": -0.0552, "num_tokens": 30345278.0, "reward": 0.7018178701400757, "reward_std": 0.8110529184341431, "rewards/rollout_reward_func/mean": 0.7018178701400757, "rewards/rollout_reward_func/std": 0.8110529184341431, "sampling/importance_sampling_ratio/max": 1.440158724784851, "sampling/importance_sampling_ratio/mean": 0.7266236543655396, "sampling/importance_sampling_ratio/min": 1.1777557119785342e-05, "sampling/sampling_logp_difference/max": 2.035083293914795, "sampling/sampling_logp_difference/mean": 0.3186987042427063, "step": 1187, "step_time": 26.47220459402888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8641211986541748, "epoch": 0.02376, "grad_norm": 0.054855167865753174, "kl": 0.34488317742943764, "learning_rate": 7.998034140004497e-06, "loss": -0.0553, "step": 1188, "step_time": 13.08989912699326 }, { "clip_ratio/high_max": 0.011904762126505375, "clip_ratio/high_mean": 0.0059523810632526875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 6.78125, "completions/mean_terminated_length": 4.653846263885498, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.16147643327713, "epoch": 0.02378, "frac_reward_zero_std": 0.125, "grad_norm": 0.15242820978164673, "kl": 0.9938031956553459, "learning_rate": 7.998030725945402e-06, "loss": -0.0511, "num_tokens": 30407437.0, "reward": 0.5789085626602173, "reward_std": 0.8508510589599609, "rewards/rollout_reward_func/mean": 0.5789085626602173, "rewards/rollout_reward_func/std": 0.8508509993553162, "sampling/importance_sampling_ratio/max": 1.2740510702133179, "sampling/importance_sampling_ratio/mean": 0.7266627550125122, "sampling/importance_sampling_ratio/min": 1.6830758795549627e-06, "sampling/sampling_logp_difference/max": 2.5871644020080566, "sampling/sampling_logp_difference/mean": 0.38665151596069336, "step": 1189, "step_time": 30.649493397999322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.159696012735367, "epoch": 0.0238, "grad_norm": 0.13960708677768707, "kl": 0.9373342767357826, "learning_rate": 7.998027308925297e-06, "loss": -0.0514, "step": 1190, "step_time": 15.378993858030299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16.0, "completions/max_terminated_length": 15.0, "completions/mean_length": 6.125, "completions/mean_terminated_length": 4.714285850524902, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.4252260029315948, "epoch": 0.02382, "frac_reward_zero_std": 0.25, "grad_norm": 0.015505895018577576, "kl": 0.2899159174412489, "learning_rate": 7.998023888944185e-06, "loss": -0.046, "num_tokens": 30461441.0, "reward": 0.6345828771591187, "reward_std": 0.8857829570770264, "rewards/rollout_reward_func/mean": 0.6345828771591187, "rewards/rollout_reward_func/std": 0.8857829570770264, "sampling/importance_sampling_ratio/max": 1.1032582521438599, "sampling/importance_sampling_ratio/mean": 0.8191916942596436, "sampling/importance_sampling_ratio/min": 6.321570253930986e-07, "sampling/sampling_logp_difference/max": 2.146472215652466, "sampling/sampling_logp_difference/mean": 0.25937336683273315, "step": 1191, "step_time": 32.97276619702461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4259701669216156, "epoch": 0.02384, "grad_norm": 0.016352538019418716, "kl": 0.297029547393322, "learning_rate": 7.99802046600207e-06, "loss": -0.046, "step": 1192, "step_time": 17.10879143301281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.40625, "completions/mean_terminated_length": 4.629629611968994, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.5249155461788177, "epoch": 0.02386, "frac_reward_zero_std": 0.25, "grad_norm": 0.03435295820236206, "kl": 0.32698993012309074, "learning_rate": 7.998017040098955e-06, "loss": -0.0682, "num_tokens": 30511618.0, "reward": 0.6309331059455872, "reward_std": 0.9007253050804138, "rewards/rollout_reward_func/mean": 0.6309331059455872, "rewards/rollout_reward_func/std": 0.900725245475769, "sampling/importance_sampling_ratio/max": 1.0792534351348877, "sampling/importance_sampling_ratio/mean": 0.7556843757629395, "sampling/importance_sampling_ratio/min": 0.00032186449971050024, "sampling/sampling_logp_difference/max": 2.010380983352661, "sampling/sampling_logp_difference/mean": 0.2288632094860077, "step": 1193, "step_time": 26.08036249698489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.524398297071457, "epoch": 0.02388, "grad_norm": 0.033644743263721466, "kl": 0.31550903990864754, "learning_rate": 7.998013611234844e-06, "loss": -0.0683, "step": 1194, "step_time": 12.376534165989142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 16.0, "completions/mean_length": 6.625, "completions/mean_terminated_length": 4.461538791656494, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.482279509305954, "epoch": 0.0239, "frac_reward_zero_std": 0.25, "grad_norm": 0.05349719151854515, "kl": 0.22240330651402473, "learning_rate": 7.998010179409741e-06, "loss": -0.0357, "num_tokens": 30567450.0, "reward": 0.376521497964859, "reward_std": 0.787101149559021, "rewards/rollout_reward_func/mean": 0.376521497964859, "rewards/rollout_reward_func/std": 0.787101149559021, "sampling/importance_sampling_ratio/max": 1.1932858228683472, "sampling/importance_sampling_ratio/mean": 0.7847068905830383, "sampling/importance_sampling_ratio/min": 1.1466491741884965e-05, "sampling/sampling_logp_difference/max": 1.6509203910827637, "sampling/sampling_logp_difference/mean": 0.20610468089580536, "step": 1195, "step_time": 28.770577330054948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.484295278787613, "epoch": 0.02392, "grad_norm": 0.05692676827311516, "kl": 0.21942738071084023, "learning_rate": 7.998006744623647e-06, "loss": -0.0358, "step": 1196, "step_time": 13.097159673983697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16.0, "completions/max_terminated_length": 10.0, "completions/mean_length": 6.65625, "completions/mean_terminated_length": 4.5, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.6466507837176323, "epoch": 0.02394, "frac_reward_zero_std": 0.0, "grad_norm": 0.02925686165690422, "kl": 0.48080096021294594, "learning_rate": 7.998003306876566e-06, "loss": -0.0776, "num_tokens": 30620414.0, "reward": 0.7950718998908997, "reward_std": 0.7662052512168884, "rewards/rollout_reward_func/mean": 0.7950718998908997, "rewards/rollout_reward_func/std": 0.7662052512168884, "sampling/importance_sampling_ratio/max": 1.1924464702606201, "sampling/importance_sampling_ratio/mean": 0.7674208879470825, "sampling/importance_sampling_ratio/min": 4.0358650039706845e-06, "sampling/sampling_logp_difference/max": 1.9830846786499023, "sampling/sampling_logp_difference/mean": 0.3255850374698639, "step": 1197, "step_time": 24.980053517007036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6460580676794052, "epoch": 0.02396, "grad_norm": 0.028755372390151024, "kl": 0.46554362028837204, "learning_rate": 7.997999866168502e-06, "loss": -0.0776, "step": 1198, "step_time": 12.159342025028309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 4.90625, "completions/mean_terminated_length": 4.1666669845581055, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 1.209335621446371, "epoch": 0.02398, "frac_reward_zero_std": 0.375, "grad_norm": 0.029435433447360992, "kl": 0.3658106364309788, "learning_rate": 7.99799642249946e-06, "loss": -0.0276, "num_tokens": 30668294.0, "reward": 0.9581764936447144, "reward_std": 0.7520145773887634, "rewards/rollout_reward_func/mean": 0.9581764936447144, "rewards/rollout_reward_func/std": 0.7520144581794739, "sampling/importance_sampling_ratio/max": 1.0873949527740479, "sampling/importance_sampling_ratio/mean": 0.8984326124191284, "sampling/importance_sampling_ratio/min": 1.6157574691533227e-06, "sampling/sampling_logp_difference/max": 1.9173848628997803, "sampling/sampling_logp_difference/mean": 0.2322019636631012, "step": 1199, "step_time": 21.561389650014462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.212371814996004, "epoch": 0.024, "grad_norm": 0.02892310917377472, "kl": 0.36033446714282036, "learning_rate": 7.997992975869439e-06, "loss": -0.0276, "step": 1200, "step_time": 11.29077386696008 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 30668294, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }